clean_up_tokenization_spaces=False if unset #31938

Merged: 10 commits, Sep 26, 2024. Changes from 9 commits.
5 changes: 5 additions & 0 deletions src/transformers/models/bert/tokenization_bert.py
@@ -88,6 +88,9 @@ class BertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -105,6 +108,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -136,6 +140,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

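For context on the parameter these diffs thread through every slow tokenizer: the "cleanup" mentioned in the docstrings removes decode-time artifacts such as a space left in front of punctuation. A minimal sketch of that idea, with an illustrative replacement list rather than the exact one used by PreTrainedTokenizerBase.clean_up_tokenization:

def cleanup_spaces_sketch(text: str) -> str:
    # Illustrative only: drop the stray spaces that wordpiece-style
    # decoding leaves before punctuation and contractions.
    replacements = [(" .", "."), (" ,", ","), (" !", "!"), (" ?", "?"), (" n't", "n't"), (" 's", "'s")]
    for artifact, fixed in replacements:
        text = text.replace(artifact, fixed)
    return text

print(cleanup_spaces_sketch("do n't stop !"))  # -> "don't stop!"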
5 changes: 5 additions & 0 deletions src/transformers/models/convbert/tokenization_convbert.py
@@ -91,6 +91,9 @@ class ConvBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original ConvBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/distilbert/tokenization_distilbert.py
@@ -90,6 +90,9 @@ class DistilBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -138,6 +142,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/electra/tokenization_electra.py
@@ -90,6 +90,9 @@ class ElectraTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original Electra).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -107,6 +110,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -138,6 +142,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/funnel/tokenization_funnel.py
@@ -107,6 +107,9 @@ class FunnelTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -127,6 +130,7 @@ def __init__(
eos_token="</s>",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -159,6 +163,7 @@ def __init__(
eos_token=eos_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -91,6 +91,9 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original LayoutLM).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/lxmert/tokenization_lxmert.py
@@ -90,6 +90,9 @@ class LxmertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original Lxmert).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -107,6 +110,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -138,6 +142,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/mobilebert/tokenization_mobilebert.py
@@ -92,6 +92,9 @@ class MobileBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original MobileBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -109,6 +112,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -140,6 +144,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/mpnet/tokenization_mpnet.py
@@ -108,6 +108,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -128,6 +131,7 @@ def __init__(
mask_token="<mask>",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
@@ -170,6 +174,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -308,6 +308,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -330,6 +333,7 @@ def __init__(
mask_token: Optional[str] = "[MASK]",
tokenize_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
clean_up_tokenization_spaces: bool = True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -360,6 +364,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

src/transformers/models/squeezebert/tokenization_squeezebert.py
@@ -91,6 +91,9 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original SqueezeBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ def __init__(
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ def __init__(
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

5 changes: 5 additions & 0 deletions src/transformers/models/tapas/tokenization_tapas.py
@@ -225,6 +225,9 @@ class TapasTokenizer(PreTrainedTokenizer):
Minimum length of each question in terms of tokens (will be skipped otherwise).
max_question_length (`int`, *optional*):
Maximum length of each question in terms of tokens (will be skipped otherwise).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -252,6 +255,7 @@ def __init__(
max_question_length=None,
model_max_length: int = 512,
additional_special_tokens: Optional[List[str]] = None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not is_pandas_available():
@@ -322,6 +326,7 @@ def __init__(
max_question_length=max_question_length,
model_max_length=model_max_length,
additional_special_tokens=additional_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

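All of the constructor changes above follow one pattern: each slow tokenizer now passes an explicit clean_up_tokenization_spaces=True to the base class, so its decoding behaviour stays the same even though the base-class default flips to False in the next file. A hedged usage sketch (the checkpoint name is an assumption; any BERT-style checkpoint should behave the same way):

from transformers import BertTokenizer

# The explicit constructor default added above keeps cleanup on for BERT;
# passing the flag at load time opts a single instance out of it.
tok = BertTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",  # assumed checkpoint
    clean_up_tokenization_spaces=False,
)
print(tok.clean_up_tokenization_spaces)  # expected: False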
6 changes: 5 additions & 1 deletion src/transformers/tokenization_utils_base.py
@@ -1607,7 +1607,11 @@ def __init__(self, **kwargs):
)

# By default, cleaning tokenization spaces for both fast and slow tokenizers
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
warnings.warn(
"The `clean_up_tokenization_spaces` argument will soon be deprecated. It currently defaults to False if not passed.",
FutureWarning,
)
Reviewer (Collaborator): let's not warn, we won't remove it!

Suggested change (remove the warning):
warnings.warn(
"The `clean_up_tokenization_spaces` argument will soon be deprecated. It currently defaults to False if not passed.",
FutureWarning,
)


# By default, do not split special tokens for both fast and slow tokenizers
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
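As a decode-level illustration of the default change above (the decoded strings are indicative and assume an uncased BERT checkpoint whose basic tokenizer splits punctuation; the per-call argument always overrides whatever default the tokenizer was constructed with):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", use_fast=False)
ids = tok("don't stop!", add_special_tokens=False)["input_ids"]

# With cleanup off, the spaces introduced by joining the split tokens survive.
print(tok.decode(ids, clean_up_tokenization_spaces=False))  # e.g. "don ' t stop !"
# Passing True restores the previous behaviour.
print(tok.decode(ids, clean_up_tokenization_spaces=True))   # e.g. "don't stop!"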
2 changes: 1 addition & 1 deletion tests/models/clvp/test_tokenization_clvp.py
@@ -79,7 +79,7 @@ def get_tokenizer(self, **kwargs):
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
output_text = "lower[SPACE]newer"
Author (Collaborator): testing sets a small vocab here, so this should be the expected behaviour; see the unmodified ClvpTokenizationTest.test_full_tokenizer for an example where [SPACE] was expected.

return input_text, output_text

# Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
8 changes: 4 additions & 4 deletions tests/models/wav2vec2/test_tokenization_wav2vec2.py
@@ -147,8 +147,8 @@ def test_tokenizer_decode_added_tokens(self):
batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)

self.assertEqual(batch_tokens, ["HELLO<unk>!?!?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"])
self.assertEqual(batch_tokens, ["HELLO<unk>!? !?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"])

def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
@@ -467,8 +467,8 @@ def test_tokenizer_decode_added_tokens(self):
batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)

self.assertEqual(batch_tokens, ["HELLO<unk>!?!?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?<new_tokens>", "BYE BYE<new_tokens>"])
self.assertEqual(batch_tokens, ["HELLO<unk>!? !?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!? !?<new_tokens>", "BYE BYE<new_tokens>"])
Author (Collaborator): the tokenizer.word_delimiter_token is replaced with " " (a space); see:

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a connectionist-temporal-classification (CTC) output tokens into a single string.
        """
      ...
        # replace delimiter token
        string = "".join([" " if token == self.word_delimiter_token else token for token in filtered_tokens]).strip()

        if self.do_lower_case:
            string = string.lower()

        return string
        

Reviewer (Collaborator): we should not have to do this! Maybe clean_up_tokenization_spaces should be True for wav2vec2, no?

Author (@itazap, Sep 26, 2024): I thought so too, but the sample_ids include tokenizer.word_delimiter_token_id, which is a space " ", so I think it would be expected in the output? wdyt @ArthurZucker


def test_special_characters_in_vocab(self):
sent = "ʈʰ æ æ̃ ˧ kʰ"
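To make the thread above concrete, a self-contained sketch of the delimiter-replacement step under discussion (the token stream is invented; it only mirrors the join quoted from convert_tokens_to_string):

word_delimiter_token = "|"  # Wav2Vec2's CTC word delimiter
filtered_tokens = ["HELLO", "<unk>", "!", "?", "|", "!", "?"]  # hypothetical decode output

# The delimiter becomes a literal space before the join ...
decoded = "".join(" " if t == word_delimiter_token else t for t in filtered_tokens).strip()
print(decoded)  # "HELLO<unk>!? !?"

# ... and with clean_up_tokenization_spaces now defaulting to False, the
# " !" artifact is no longer collapsed, which is why the expected strings in
# these tests gained a space. Decoding with clean_up_tokenization_spaces=True
# would remove it again.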
tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
@@ -249,7 +249,7 @@ def test_tokenizer_decode_added_tokens(self):
# fmt: on

batch_tokens = tokenizer.batch_decode(sample_ids)
self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ!?!? $$$", "j ð s j ð s oːɹ $$$"])
self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ ! ? ! ? $$$", "j ð s j ð s oːɹ $$$"])

@staticmethod
def get_from_offsets(offsets, key):