diff --git a/tests/test_splitter.py b/tests/test_splitter.py new file mode 100644 index 0000000..05d8a9a --- /dev/null +++ b/tests/test_splitter.py @@ -0,0 +1,198 @@ +"""Test cases for the Splitter class.""" +import json +import unittest +from unittest import mock +from unittest.mock import patch, mock_open + +import requests +from pysentence_similarity import Splitter + + +class TestSplitter(unittest.TestCase): + """Test cases for the Splitter class.""" + + def setUp(self) -> None: + """Set up the test case by creating an instance of Splitter.""" + self.splitter = Splitter( + markers_to_split=[".", "?", "!"], + preserve_markers=True + ) + + def test_split_from_text(self) -> None: + """Test splitting text into sentences.""" + text = "Hello world! How are you? I'm fine." + expected = ["Hello world!", "How are you?", "I'm fine."] + result = self.splitter.split_from_text(text) + self.assertEqual(result, expected) + + def test_split_from_text_empty(self) -> None: + """Test splitting an empty string.""" + text = "" + result = self.splitter.split_from_text(text) + self.assertEqual(result, []) + + def test_split_from_text_whitespace(self) -> None: + """Test splitting a string with only whitespaces.""" + text = " " + result = self.splitter.split_from_text(text) + self.assertEqual(result, []) + + def test_split_from_file(self) -> None: + """Test splitting text from a file.""" + text = "Hello. This is a test file.\nHow are you?" + expected = ["Hello.", "This is a test file.", "How are you?"] + + with patch("builtins.open", mock_open(read_data=text)) as mock_file: + result = self.splitter.split_from_file("fake_file.txt") + mock_file.assert_called_once_with( + "fake_file.txt", "r", encoding="utf-8") + self.assertEqual(result, expected) + + @patch("requests.get") + def test_split_from_url(self, mock_get) -> None: + """Test splitting text from a URL with a mocked HTTP request.""" + # Mock response content with HTML + mock_html = ( + "Hello world. How are you?
" + "I'm fine." + ) + mock_response = mock_get.return_value + mock_response.content = mock_html.encode('utf-8') + mock_response.raise_for_status = unittest.mock.Mock() + + expected = ["Hello world.", "How are you?", "I'm fine."] + result = self.splitter.split_from_url("http://example.com") + + mock_get.assert_called_once_with("http://example.com", timeout=10) + self.assertEqual(result, expected) + + @patch("requests.get") + def test_split_from_url_http_error(self, mock_get) -> None: + """Test handling an HTTP error in split_from_url.""" + mock_get.side_effect = requests.exceptions.HTTPError("404 Not Found") + + with self.assertRaises(requests.exceptions.HTTPError): + self.splitter.split_from_url("http://example.com") + + def test_split_from_url_invalid_url(self) -> None: + """Test handling invalid URL type in split_from_url.""" + with self.assertRaises(TypeError): + self.splitter.split_from_url(12345) + + def test_split_from_url_invalid_timeout(self) -> None: + """Test handling invalid timeout type in split_from_url.""" + with self.assertRaises(TypeError): + self.splitter.split_from_url("http://example.com", timeout="five") + + @patch( + "builtins.open", + new_callable=mock_open, + read_data='text1,text2\nHello World!,"This is a test."\n' + ) + def test_split_from_csv_valid(self, mock_file) -> None: + """Test valid CSV input with multiple columns.""" + result = self.splitter.split_from_csv( + "fake_path.csv", ["text1", "text2"]) + expected_result = ["Hello World!", "This is a test."] + self.assertEqual(result, expected_result) + mock_file.assert_called_once_with( + "fake_path.csv", 'r', encoding='utf-8') + + @patch("builtins.open", new_callable=mock_open) + def test_split_from_csv_empty_file(self, mock_file) -> None: + """Test handling of an empty CSV file.""" + mock_file.return_value.read.side_effect = '' + with self.assertRaises(ValueError): + self.splitter.split_from_csv("fake_path.csv", ["text1", "text2"]) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data='text1,text2\nHello World!,"This is a test."\n' + ) + def test_split_from_csv_missing_column(self, mock_file) -> None: + """Test handling of a missing column in the CSV.""" + with self.assertRaises(ValueError): + self.splitter.split_from_csv( + "fake_path.csv", ["text1", "missing_column"]) + + def test_split_from_csv_invalid_file_path(self) -> None: + """Test handling of an invalid file path.""" + with self.assertRaises(FileNotFoundError): + self.splitter.split_from_csv("invalid_path.csv", ["text1"]) + + def test_split_from_csv_invalid_column_names(self) -> None: + """Test handling of invalid column names argument.""" + with self.assertRaises(TypeError): + self.splitter.split_from_csv("fake_path.csv", "text1") + + def test_split_from_csv_non_string_column_names(self) -> None: + """Test handling of non-string column names.""" + with self.assertRaises(TypeError): + self.splitter.split_from_csv("fake_path.csv", [123]) + + @mock.patch( + 'builtins.open', + new_callable=mock_open, + read_data=json.dumps({ + "key1": "This is the first sentence. This is the second sentence.", + "key2": "Another sentence here." + })) + def test_split_from_json_valid(self, mock_file) -> None: + """Test splitting sentences from a valid JSON file.""" + result = self.splitter.split_from_json( + "dummy_path.json", ["key1", "key2"]) + expected = [ + "This is the first sentence.", + "This is the second sentence.", + "Another sentence here." + ] + self.assertEqual(result, expected) + + @mock.patch('builtins.open', new_callable=mock_open, read_data=json.dumps({ + "key1": "", + "key2": " " + })) + def test_split_from_json_empty_keys(self, mock_file) -> None: + """Test handling of empty strings in keys.""" + result = self.splitter.split_from_json( + "dummy_path.json", ["key1", "key2"]) + self.assertEqual(result, []) + + @mock.patch('builtins.open', new_callable=mock_open, read_data=json.dumps({ + "key1": "This is a test sentence." + })) + def test_split_from_json_missing_key(self, mock_file) -> None: + """Test handling of a missing key in the JSON.""" + result = self.splitter.split_from_json( + "dummy_path.json", ["key1", "missing_key"]) + expected = ["This is a test sentence."] + self.assertEqual(result, expected) + + @mock.patch('builtins.open', side_effect=FileNotFoundError) + def test_split_from_json_file_not_found(self, mock_file) -> None: + """Test handling of a file not found error.""" + with self.assertRaises(FileNotFoundError): + self.splitter.split_from_json("dummy_path.json", ["key1"]) + + @mock.patch( + 'builtins.open', new_callable=mock_open, read_data='not a json' + ) + def test_split_from_json_json_decode_error(self, mock_file) -> None: + """Test handling of JSON decode error.""" + with self.assertRaises(json.JSONDecodeError): + self.splitter.split_from_json("dummy_path.json", ["key1"]) + + def test_split_from_json_invalid_file_path_type(self) -> None: + """Test passing an invalid file path type.""" + with self.assertRaises(TypeError): + self.splitter.split_from_json(123, ["key1"]) + + def test_split_from_json_invalid_keys_type(self) -> None: + """Test passing an invalid keys type.""" + with self.assertRaises(TypeError): + self.splitter.split_from_json("dummy_path.json", "key1") + + +if __name__ == "__main__": + unittest.main()