Skip parentheses at FirstToken(State).

fnl · Jan 26, 2022 · b74e65e · b74e65e
1 parent 3f3a045
commit b74e65e
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 9 deletions.
diff --git a/syntok/_segmentation_states.py b/syntok/_segmentation_states.py
@@ -258,7 +258,7 @@ def __find_next_token_after_bracket(self) -> str:
         else:
             return ""
 
-    def __skip_bracketed_text(self) -> bool:
+    def _skip_bracketed_text(self) -> bool:
         """
         Move over bracketed text if not too long and not looking like a sentence,
         when next is an opening bracket.
@@ -334,7 +334,7 @@ def _move_and_skip_bracketed_text(self) -> bool:
         """Advance the queue, and also skip over bracketed text if applicable."""
         if self._move():
             if self.next_is_an_opening_bracket:
-                self.__skip_bracketed_text()
+                self._skip_bracketed_text()
 
         if not self.__queue:
             return self._fetch_next()
@@ -425,7 +425,7 @@ def __move_to_next_relevant_word_and_return_token_after_terminal(self) -> str:
         token = None
 
         if self.next_is_an_opening_bracket and self.last not in State.terminals:
-            self.__skip_bracketed_text()
+            self._skip_bracketed_text()
         else:
             while self.next_is_a_post_terminal_symbol_part_of_sentence:
                 if not self._move():
@@ -463,6 +463,10 @@ def is_single_letter_or_roman_numeral(token):
 class FirstToken(State):
     def __next__(self) -> State:
         if not self.is_empty or self._fetch_next():
+            # If a sentence opens with bracketed text that can be skipped and is long enough, treat it as its own sentence.
+            if self.next_is_an_opening_bracket and self._skip_bracketed_text() and len(self._history) > 3:
+                return Terminal(self._stream, self._queue, self._history)
+
             self._move()  # Plain move without bracket skipping; sentence-opening brackets are handled above.
 
             if self.next_is_a_terminal:

diff --git a/syntok/segmenter_test.py b/syntok/segmenter_test.py
@@ -71,19 +71,19 @@
 Now only this splits: the EU.
 A sentence ending in U.S. Another that will not split.
 12 monkeys ran into here.
-Nested
-(Parenthesis.
-(With words inside!
-(Right.))
-(More stuff.
-Uff, this is it!))
+Nested (Parenthesis. (With words inside! (Right.)) (More. This is it!))
 In the Big City.
+(This is a very long sentence inside parenthesis.
+Followed by another, so we want to split them.)
 How we got an A. Mathematics . dot times.
 An abbreviation at the end..
 This is a sentence terminal ellipsis...
 This is another sentence terminal ellipsis....
 An easy to handle G. species mention.
 Am 13. Jän. 2006 war es regnerisch.
+(Phil. 4:8)
+(Oh. Again!)
+Syntok even handles bible quotes!
 The basis for Lester B. Pearson's policy was later.
 This model was introduced by Dr. Edgar F. Codd after initial criticisms.
 This quote "He said it." is actually inside.
@@ -371,6 +371,15 @@ def test_abbreviation_followed_by_parenthesis(self):
         result = segmenter.split(iter(tokens))
         self.assertEqual([tokens], result)
 
+    def test_do_not_split_bible_citation(self):
+        tokens = Tokenizer().split(
+            "This is a bible quote? (Phil. 4:8) Yes, it is!"
+        )
+        result = segmenter.split(iter(tokens))
+        self.assertEqual(len(result[0]), 6)
+        self.assertEqual(len(result[1]), 5)
+        self.assertEqual(len(result[2]), 5)
+
     def test_do_not_split_short_text_inside_parenthesis(self):
         tokens = Tokenizer().split(
             "This is (Proc. ABC with Abs. Reg. Compliance) not here."