Skip to content

Commit

Permalink
moving the T5Tokenizer.clean_up_tokenization call into the standardize_punct method
Browse files Browse the repository at this point in the history
  • Loading branch information
chanind committed May 9, 2022
1 parent 5f4a723 commit 12ccf38
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
5 changes: 4 additions & 1 deletion frame_semantic_transformer/data/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import re
from typing import Iterator, Sequence, TypeVar

from transformers import T5Tokenizer

T = TypeVar("T")


Expand All @@ -14,8 +16,9 @@ def standardize_punct(sent: str) -> str:
"""
Try to standardize things like "He 's a man" -> "He's a man"
"""
updated_sent = T5Tokenizer.clean_up_tokenization(sent)
# remove space before punct
updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?[.',:?])", r"\1\2", sent)
updated_sent = re.sub(r"([a-zA-Z0-9])\s+(\*?[.',:?])", r"\1\2", updated_sent)
# remove repeated *'s
updated_sent = re.sub(r"\*+", "*", updated_sent)
# fix spaces in contractions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from dataclasses import dataclass
import re

from transformers import T5Tokenizer
from frame_semantic_transformer.data.data_utils import standardize_punct

from frame_semantic_transformer.data.task_samples.TaskSample import TaskSample
Expand Down Expand Up @@ -69,4 +68,4 @@ def process_text_for_evaluation(sent: str) -> str:
updated_sent = standardize_punct(sent)
updated_sent = re.sub(r"\*\s+([a-zA-Z0-9])", r"*\1", updated_sent)
updated_sent = re.sub(r"([a-zA-Z0-9])(\*?')", r"\1 \2", updated_sent)
return T5Tokenizer.clean_up_tokenization(updated_sent)
return updated_sent

0 comments on commit 12ccf38

Please sign in to comment.