Skip to content

Commit

Permalink
Skip parenthesis at FirstToken(State).
Browse files Browse the repository at this point in the history
  • Loading branch information
fnl committed Jan 26, 2022
1 parent 3f3a045 commit b74e65e
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 9 deletions.
10 changes: 7 additions & 3 deletions syntok/_segmentation_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def __find_next_token_after_bracket(self) -> str:
else:
return ""

def __skip_bracketed_text(self) -> bool:
def _skip_bracketed_text(self) -> bool:
"""
Move over bracketed text if not too long and not looking like a sentence,
when next is an opening bracket.
Expand Down Expand Up @@ -334,7 +334,7 @@ def _move_and_skip_bracketed_text(self) -> bool:
"""Advance the queue, and also skip over bracketed text if applicable."""
if self._move():
if self.next_is_an_opening_bracket:
self.__skip_bracketed_text()
self._skip_bracketed_text()

if not self.__queue:
return self._fetch_next()
Expand Down Expand Up @@ -425,7 +425,7 @@ def __move_to_next_relevant_word_and_return_token_after_terminal(self) -> str:
token = None

if self.next_is_an_opening_bracket and self.last not in State.terminals:
self.__skip_bracketed_text()
self._skip_bracketed_text()
else:
while self.next_is_a_post_terminal_symbol_part_of_sentence:
if not self._move():
Expand Down Expand Up @@ -463,6 +463,10 @@ def is_single_letter_or_roman_numeral(token):
class FirstToken(State):
def __next__(self) -> State:
if not self.is_empty or self._fetch_next():
# If a sentence opens with a bracket and the bracketed text could be skipped, leaving more than three entries in the history, treat the bracketed text as a sentence of its own.
if self.next_is_an_opening_bracket and self._skip_bracketed_text() and len(self._history) > 3:
return Terminal(self._stream, self._queue, self._history)

self._move() # Do not skip parenthesis if they open the sentence.

if self.next_is_a_terminal:
Expand Down
21 changes: 15 additions & 6 deletions syntok/segmenter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,19 @@
Now only this splits: the EU.
A sentence ending in U.S. Another that will not split.
12 monkeys ran into here.
Nested
(Parenthesis.
(With words inside!
(Right.))
(More stuff.
Uff, this is it!))
Nested (Parenthesis. (With words inside! (Right.)) (More. This is it!))
In the Big City.
(This is a very long sentence inside parenthesis.
Followed by another, so we want to split them.)
How we got an A. Mathematics . dot times.
An abbreviation at the end..
This is a sentence terminal ellipsis...
This is another sentence terminal ellipsis....
An easy to handle G. species mention.
Am 13. Jän. 2006 war es regnerisch.
(Phil. 4:8)
(Oh. Again!)
Syntok even handles bible quotes!
The basis for Lester B. Pearson's policy was later.
This model was introduced by Dr. Edgar F. Codd after initial criticisms.
This quote "He said it." is actually inside.
Expand Down Expand Up @@ -371,6 +371,15 @@ def test_abbreviation_followed_by_parenthesis(self):
result = segmenter.split(iter(tokens))
self.assertEqual([tokens], result)

def test_do_not_split_bible_citation(self):
    # The input holds a question, a bracketed citation containing an
    # abbreviation ("Phil."), and a closing exclamation.
    text = "This is a bible quote? (Phil. 4:8) Yes, it is!"
    sentences = segmenter.split(iter(Tokenizer().split(text)))

    # Check the token count of each of the first three sentences in order;
    # the middle one is presumably the whole citation kept in one piece.
    for position, token_count in enumerate((6, 5, 5)):
        self.assertEqual(len(sentences[position]), token_count)

def test_do_not_split_short_text_inside_parenthesis(self):
tokens = Tokenizer().split(
"This is (Proc. ABC with Abs. Reg. Compliance) not here."
Expand Down

0 comments on commit b74e65e

Please sign in to comment.