diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 707bfae89db5..9eed8cfb42c0 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -1319,7 +1319,10 @@ def tokenizer(self, proto):
             raise Exception(
                 "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
             )
-
+        user_defined_symbols = [
+            AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols
+        ]
+        tokenizer.add_tokens(user_defined_symbols)
         return tokenizer
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index e99104f7f6dd..0cee3347c408 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -52,7 +52,7 @@
 @require_sentencepiece
 @require_tokenizers
 class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "hf-internal-testing/llama-tokenizer"
+    from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]
     tokenizer_class = LlamaTokenizer
     rust_tokenizer_class = LlamaTokenizerFast
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index bc169332d3fc..fa1be251e0d8 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -51,6 +51,7 @@
     get_tests_dir,
     is_pt_tf_cross_test,
     require_jinja,
+    require_read_token,
     require_tf,
     require_tokenizers,
     require_torch,
@@ -200,13 +201,19 @@ class TokenizerTesterMixin:
     def setUp(self) -> None:
         # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
         # information available in Tokenizer (name, rust class, python class, vocab key name)
+        self.from_pretrained_id = (
+            [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id
+        )
+
+        self.tokenizers_list = []
         if self.test_rust_tokenizer:
             self.tokenizers_list = [
                 (
                     self.rust_tokenizer_class,
-                    self.from_pretrained_id,
+                    pretrained_id,
                     self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
                 )
+                for pretrained_id in self.from_pretrained_id
             ]
         else:
             self.tokenizers_list = []
@@ -1544,6 +1551,56 @@ def test_maximum_encoding_length_pair_input(self):
             self.assertEqual(len(overflowing_tokens), 2 + stride)
             self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :])
 
+    @slow
+    @require_read_token
+    def test_encode_decode_fast_slow_all_tokens(self):
+        if self.rust_tokenizer_class is not None:
+            # self.from_pretrained_id is normalized to a list in setUp, so check every checkpoint.
+            for pretrained_name in self.from_pretrained_id:
+                slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
+                with self.subTest(f"{pretrained_name}"):
+                    rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
+                        pretrained_name, from_slow=True, legacy=False
+                    )
+                    input_full_vocab_ids = list(
+                        range(len(slow_tokenizer))
+                    )  # TODO let's maybe shuffle this! And run it 4 times. This way we cover more combinations.
+                    input_full_vocab_string = rust_tokenizer.convert_tokens_to_string(
+                        rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids)
+                    )
+                    print(f"Length of the input string that is tested: {len(input_full_vocab_string)}")
+
+                    for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
+                        string_to_check = input_full_vocab_string[chunk : chunk + 1024]
+                        with self.subTest(f"{(chunk / len(input_full_vocab_string)) * 100}%"):
+                            slow_encode = slow_tokenizer.encode(string_to_check)
+                            fast_encode = rust_tokenizer.encode(string_to_check)
+                            self.assertEqual(
+                                slow_encode,
+                                fast_encode,
+                                "Hint: the following tokenization diffs were obtained for slow vs fast:\n "
+                                f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check)) - set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
+                                f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check)) - set(slow_tokenizer.tokenize(string_to_check))} \n"
+                                f"string used: {string_to_check}",
+                            )
+                    print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
+                    for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
+                        ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
+                        with self.subTest(f"{(chunk / len(input_full_vocab_ids)) * 100}%"):
+                            self.assertEqual(
+                                slow_tokenizer.decode(
+                                    ids_to_decode,
+                                    space_between_special_tokens=False,
+                                    clean_up_tokenization_spaces=False,
+                                ),
+                                rust_tokenizer.decode(
+                                    ids_to_decode,
+                                    space_between_special_tokens=False,
+                                    clean_up_tokenization_spaces=False,
+                                ),
+                                f"Hint: here are the tokens being decoded: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}",
+                            )
+
     # def test_encode_input_type(self):
     #     tokenizers = self.get_tokenizers(do_lower_case=False)
     #     for tokenizer in tokenizers:
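For context, here is a minimal sketch of the data the `convert_slow_tokenizer.py` hunk acts on; it is not part of the patch. It assumes the `sentencepiece` and `protobuf` packages are installed, and `tokenizer.model` is a hypothetical SentencePiece file trained with `--user_defined_symbols`:

```python
# A minimal sketch, not part of the patch: where the converter finds the
# user-defined symbols it now re-registers. "tokenizer.model" is a
# hypothetical SentencePiece file trained with --user_defined_symbols=<custom>.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

proto = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:
    proto.ParseFromString(f.read())

# The patched SpmConverter wraps each of these pieces in
# AddedToken(token, normalized=False, special=False) and passes them to
# tokenizer.add_tokens(...), so the converted fast tokenizer keeps them
# atomic instead of splitting them.
print(proto.trainer_spec.user_defined_symbols)  # e.g. ['<custom>']
```

With those symbols registered during conversion, the new `test_encode_decode_fast_slow_all_tokens` test can round-trip the full vocabulary through both the slow and the fast tokenizer and assert identical encodings and decodings, which is the kind of fast/slow divergence this patch targets.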