From e7ea676f75b709fc0405975f9073120d367adfc4 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 6 Mar 2024 10:23:34 +0900 Subject: [PATCH 01/15] use user_defined_symbols --- src/transformers/convert_slow_tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 707bfae89db5..2edfd8258a1e 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1319,7 +1319,8 @@ def tokenizer(self, proto): raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" ) - + user_defined_symbols = [AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols] + tokenizer.add_tokens(user_defined_symbols) return tokenizer From 2b8461d1abe06844319fd0c0766c1208dd258afb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 6 Mar 2024 11:43:32 +0900 Subject: [PATCH 02/15] fixup --- src/transformers/convert_slow_tokenizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 2edfd8258a1e..582b71f2a2e0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,6 +23,7 @@ from typing import Dict, List, Tuple from packaging import version + from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece @@ -1319,7 +1320,9 @@ def tokenizer(self, proto): raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" ) - user_defined_symbols = [AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols] + user_defined_symbols = [ + AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols + ] tokenizer.add_tokens(user_defined_symbols) return tokenizer From be54724631d7db398d41ffb46388e3168e9f8606 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 6 Mar 2024 11:43:37 +0900 Subject: [PATCH 03/15] nit --- src/transformers/convert_slow_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 582b71f2a2e0..9eed8cfb42c0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,7 +23,6 @@ from typing import Dict, List, Tuple from packaging import version - from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece From ea4744fc767a4373653e25364323517b1001c40d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 18 Mar 2024 14:04:45 +1100 Subject: [PATCH 04/15] add a very robust test --- tests/models/gemma/test_tokenization_gemma.py | 2 +- tests/models/llama/test_tokenization_llama.py | 1 + tests/models/t5/test_tokenization_t5.py | 1 + tests/test_tokenization_common.py | 60 ++++++++++++++++++- 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index a16d471a24b8..a753854c31bc 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -51,7 +51,7 @@ class 
GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = GemmaTokenizer rust_tokenizer_class = GemmaTokenizerFast - + pretrained_tokenizers_to_test = [(GemmaTokenizer, "google/gemma-2b", None)] test_rust_tokenizer = False test_sentencepiece = True from_pretrained_kwargs = {} diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index f3674a83b085..0247ef701d51 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -54,6 +54,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = LlamaTokenizer rust_tokenizer_class = LlamaTokenizerFast + pretrained_tokenizers_to_test = [(LlamaTokenizer, "meta-llama/Llama-2-7b-hf", None)] test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index b0755dc1ba00..1135dd247b6e 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -40,6 +40,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast + pretrained_tokenizers_to_test = [(T5Tokenizer, "google-t5/t5-base", None)] test_rust_tokenizer = True test_sentencepiece = True diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index d0c587491144..26eb47cdda18 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -51,6 +51,7 @@ get_tests_dir, is_pt_tf_cross_test, require_jinja, + require_read_token, require_tf, require_tokenizers, require_torch, @@ -180,6 +181,7 @@ def check_subword_sampling( class TokenizerTesterMixin: tokenizer_class = None + pretrained_tokenizer_to_test = None rust_tokenizer_class = None test_slow_tokenizer = True test_rust_tokenizer = True @@ -214,7 +216,9 @@ def setUp(self) -> None: ] self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed else: - self.tokenizers_list = [] + self.tokenizers_list = ( + [] if self.pretrained_tokenizer_to_test is None else self.pretrained_tokenizer_to_test + ) with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: self._data = f_data.read().replace("\n\n", "\n").strip() @@ -1503,6 +1507,60 @@ def test_maximum_encoding_length_pair_input(self): self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :]) + @slow + @require_read_token + def test_encode_decode_fast_slow_all_tokens(self): + if self.rust_tokenizer_class is not None: + if len(self.pretrained_tokenizers_to_test) < 0: + raise ValueError( + "You have to define a `pretrained_tokenizers_to_test` attribute to the {self.class} to make sure all functionalities are properly tested." + ) + for slow_tokenizer, pretrained_name, kwargs in self.pretrained_tokenizers_to_test: + print(pretrained_name) + slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) + with self.subTest(f"{pretrained_name}"): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained( + pretrained_name, from_slow=True, legacy=False + ) + input_full_vocab_ids = list( + range(len(slow_tokenizer)) + ) # TODO let's maybe shuffle this! And run it 4 times. 
This way we cover more cmbinations + input_full_vocab_string = rust_tokenizer.convert_tokens_to_string( + rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids) + ) + print(f"Length of the input string that is tested: {len(input_full_vocab_string)}") + + for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): + string_to_check = input_full_vocab_string[chunk : chunk + 1024] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + slow_encode = slow_tokenizer.encode(string_to_check) + fast_encode = rust_tokenizer.encode(string_to_check) + self.assertEquals( + slow_encode, + fast_encode, + "Hint: the following tokenization diff were obtained for slow vs fast:\n " + f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" + f"string used : {string_to_check}", + ) + print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") + for chunk in range(0, len(input_full_vocab_ids) - 100, 100): + ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + self.assertEquals( + slow_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + rust_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}", + ) + # def test_encode_input_type(self): # tokenizers = self.get_tokenizers(do_lower_case=False) # for tokenizer in tokenizers: From 0b51d3525f4ad302eb38364c8986fa157623ea92 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 18 Mar 2024 16:43:53 +1100 Subject: [PATCH 05/15] make sure all models are tested with the `pretrained_tokenizer_to_test` --- tests/test_tokenization_common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 26eb47cdda18..ded305d0dc96 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -201,6 +201,7 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) + self.tokenizers_list = [] if self.test_rust_tokenizer: tokenizers_list = [ ( @@ -215,10 +216,10 @@ def setUp(self) -> None: or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name)) ] self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed - else: - self.tokenizers_list = ( - [] if self.pretrained_tokenizer_to_test is None else self.pretrained_tokenizer_to_test - ) + + if self.pretrained_tokenizer_to_test is not None: + self.tokenizers_list += self.pretrained_tokenizer_to_test + with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: self._data = f_data.read().replace("\n\n", "\n").strip() From 0c023301ccd1fae40a2acac702afe2689c72b111 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 18 Mar 2024 16:49:49 +1100 Subject: [PATCH 06/15] should we make sure we test all of them? 
--- tests/test_tokenization_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index ded305d0dc96..4b077c52249e 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -393,6 +393,10 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] + def test_pretrained_tokenizer_is_fully_tested(self): + if self.pretrained_tokenizer_to_test is None: + raise ValueError("This tokenizer test does not define a `pretrained_tokenizer_to_test`. This is now required. Make sure to add one.") + # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. def test_tokenize_special_tokens(self): """Test `tokenize` with special tokens.""" From 6bae13085c2748dc32bb89299d11f508a8b57319 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:24:59 +0900 Subject: [PATCH 07/15] merge --- tests/models/gemma/test_tokenization_gemma.py | 2 +- tests/models/llama/test_tokenization_llama.py | 3 +- tests/models/t5/test_tokenization_t5.py | 1 - tests/test_tokenization_common.py | 94 +++++++++---------- 4 files changed, 47 insertions(+), 53 deletions(-) diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index 122069ab6c92..5e485da491f8 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -52,7 +52,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_id = "google/gemma-7b" tokenizer_class = GemmaTokenizer rust_tokenizer_class = GemmaTokenizerFast - pretrained_tokenizers_to_test = [(GemmaTokenizer, "google/gemma-2b", None)] + test_rust_tokenizer = False test_sentencepiece = True from_pretrained_kwargs = {} diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index c34f1407a96d..0cee3347c408 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -52,10 +52,9 @@ @require_sentencepiece @require_tokenizers class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "hf-internal-testing/llama-tokenizer" + from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"] tokenizer_class = LlamaTokenizer rust_tokenizer_class = LlamaTokenizerFast - pretrained_tokenizers_to_test = [(LlamaTokenizer, "meta-llama/Llama-2-7b-hf", None)] test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index d4b516b65811..388388ff2388 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -41,7 +41,6 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_id = "google-t5/t5-small" tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast - pretrained_tokenizers_to_test = [(T5Tokenizer, "google-t5/t5-base", None)] test_rust_tokenizer = True test_sentencepiece = True diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 69b747952d5c..c9640d7474f2 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1555,55 +1555,51 @@ def test_maximum_encoding_length_pair_input(self): 
@require_read_token def test_encode_decode_fast_slow_all_tokens(self): if self.rust_tokenizer_class is not None: - if len(self.pretrained_tokenizers_to_test) < 0: - raise ValueError( - "You have to define a `pretrained_tokenizers_to_test` attribute to the {self.class} to make sure all functionalities are properly tested." - ) - for slow_tokenizer, pretrained_name, kwargs in self.pretrained_tokenizers_to_test: - print(pretrained_name) - slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) - with self.subTest(f"{pretrained_name}"): - rust_tokenizer = self.rust_tokenizer_class.from_pretrained( - pretrained_name, from_slow=True, legacy=False - ) - input_full_vocab_ids = list( - range(len(slow_tokenizer)) - ) # TODO let's maybe shuffle this! And run it 4 times. This way we cover more cmbinations - input_full_vocab_string = rust_tokenizer.convert_tokens_to_string( - rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids) - ) - print(f"Length of the input string that is tested: {len(input_full_vocab_string)}") - - for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): - string_to_check = input_full_vocab_string[chunk : chunk + 1024] - with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): - slow_encode = slow_tokenizer.encode(string_to_check) - fast_encode = rust_tokenizer.encode(string_to_check) - self.assertEquals( - slow_encode, - fast_encode, - "Hint: the following tokenization diff were obtained for slow vs fast:\n " - f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " - f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" - f"string used : {string_to_check}", - ) - print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") - for chunk in range(0, len(input_full_vocab_ids) - 100, 100): - ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] - with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): - self.assertEquals( - slow_tokenizer.decode( - ids_to_decode, - space_between_special_tokens=False, - clean_up_tokenization_spaces=False, - ), - rust_tokenizer.decode( - ids_to_decode, - space_between_special_tokens=False, - clean_up_tokenization_spaces=False, - ), - f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}", - ) + pretrained_name = self.from_pretrained_id + + slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) + with self.subTest(f"{pretrained_name}"): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained( + pretrained_name, from_slow=True, legacy=False + ) + input_full_vocab_ids = list( + range(len(slow_tokenizer)) + ) # TODO let's maybe shuffle this! And run it 4 times. 
This way we cover more cmbinations + input_full_vocab_string = rust_tokenizer.convert_tokens_to_string( + rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids) + ) + print(f"Length of the input string that is tested: {len(input_full_vocab_string)}") + + for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): + string_to_check = input_full_vocab_string[chunk : chunk + 1024] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + slow_encode = slow_tokenizer.encode(string_to_check) + fast_encode = rust_tokenizer.encode(string_to_check) + self.assertEquals( + slow_encode, + fast_encode, + "Hint: the following tokenization diff were obtained for slow vs fast:\n " + f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" + f"string used : {string_to_check}", + ) + print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") + for chunk in range(0, len(input_full_vocab_ids) - 100, 100): + ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + self.assertEquals( + slow_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + rust_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}", + ) # def test_encode_input_type(self): # tokenizers = self.get_tokenizers(do_lower_case=False) From 66176758a164c31fdca2b6ab7f177aff2b99c2c6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:28:36 +0900 Subject: [PATCH 08/15] remove the id --- tests/test_tokenization_common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index c9640d7474f2..8c648f1f1aaa 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -386,9 +386,6 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] - def test_pretrained_tokenizer_is_fully_tested(self): - if self.pretrained_tokenizer_to_test is None: - raise ValueError("This tokenizer test does not define a `pretrained_tokenizer_to_test`. This is now required. Make sure to add one.") # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. def test_tokenize_special_tokens(self): From 9f174effea3e929487b8ecd5152c0edf6e6828d6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:30:21 +0900 Subject: [PATCH 09/15] fix test --- tests/test_tokenization_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 8c648f1f1aaa..3e0cf7e05638 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -386,7 +386,6 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] - # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. 
def test_tokenize_special_tokens(self): """Test `tokenize` with special tokens.""" @@ -1554,7 +1553,7 @@ def test_encode_decode_fast_slow_all_tokens(self): if self.rust_tokenizer_class is not None: pretrained_name = self.from_pretrained_id - slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) + slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False) with self.subTest(f"{pretrained_name}"): rust_tokenizer = self.rust_tokenizer_class.from_pretrained( pretrained_name, from_slow=True, legacy=False From 31e8a7c9ad574d6a5141326a51bd6a803893eeba Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:45:30 +0900 Subject: [PATCH 10/15] update --- tests/test_tokenization_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 3e0cf7e05638..e6cd1193da08 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -202,14 +202,17 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) + self.from_pretrained_id = list(self.from_pretrained_id) + self.tokenizers_list = [] if self.test_rust_tokenizer: self.tokenizers_list = [ ( self.rust_tokenizer_class, - self.from_pretrained_id, + pretrained_id, self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, ) + for pretrained_id in self.from_pretrained_id ] else: self.tokenizers_list = [] From 9d0f0c0c8da0c8d396eae14b5ba3d4d94fa19eaf Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 19:11:38 +0900 Subject: [PATCH 11/15] ousies --- src/transformers/convert_slow_tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 9eed8cfb42c0..582b71f2a2e0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,6 +23,7 @@ from typing import Dict, List, Tuple from packaging import version + from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece From aa3f2b2ec0510203f5bbefeca7ac5788732ea724 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 19:15:26 +0900 Subject: [PATCH 12/15] oups --- tests/test_tokenization_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index e6cd1193da08..488dbe909a10 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -202,7 +202,7 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) - self.from_pretrained_id = list(self.from_pretrained_id) + self.from_pretrained_id = [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id self.tokenizers_list = [] if self.test_rust_tokenizer: From ad794806ab6f8c7b451c298d44cbdedc67565c25 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 19:18:28 +0900 Subject: [PATCH 13/15] fixup --- tests/test_tokenization_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) 
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 488dbe909a10..35e730b472ef 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -202,7 +202,9 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) - self.from_pretrained_id = [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id + self.from_pretrained_id = ( + [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id + ) self.tokenizers_list = [] if self.test_rust_tokenizer: From 1327998597d011a07c89be8007ff20cbd23edeb7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 21:54:36 +0900 Subject: [PATCH 14/15] fix copies check --- src/transformers/convert_slow_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 582b71f2a2e0..9eed8cfb42c0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,7 +23,6 @@ from typing import Dict, List, Tuple from packaging import version - from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece From d1b9bf7f1994a3f5a6b1240f8ceb78fb75405c65 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 22:19:13 +0900 Subject: [PATCH 15/15] remove `pretrained_tokenizer_to_test` --- tests/test_tokenization_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 35e730b472ef..fa1be251e0d8 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -181,7 +181,6 @@ def check_subword_sampling( class TokenizerTesterMixin: tokenizer_class = None - pretrained_tokenizer_to_test = None rust_tokenizer_class = None test_slow_tokenizer = True test_rust_tokenizer = True
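
A minimal end-to-end sketch of what the new `test_encode_decode_fast_slow_all_tokens` check exercises (not part of the patches above, and only an illustration). It assumes one of the sentencepiece-based checkpoints now listed in `from_pretrained_id` — here `hf-internal-testing/llama-tokenizer` — is reachable and has both a slow and a fast tokenizer class:

from transformers import AutoTokenizer

# Assumption: any SPM-based checkpoint from `from_pretrained_id` works here.
checkpoint = "hf-internal-testing/llama-tokenizer"

slow = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, legacy=False)
fast = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, from_slow=True, legacy=False)

# Build one long string that touches every id in the slow vocabulary; with the
# convert_slow_tokenizer change above, user-defined symbols from the sentencepiece
# proto are added to the converted fast tokenizer, so both sides see the same tokens.
all_ids = list(range(len(slow)))
full_string = fast.convert_tokens_to_string(fast.convert_ids_to_tokens(all_ids))

# Encoding should agree chunk by chunk (1024-character windows, as in the test).
for start in range(0, len(full_string) - 1024, 1024):
    chunk = full_string[start : start + 1024]
    assert slow.encode(chunk) == fast.encode(chunk), f"slow/fast encode mismatch near offset {start}"

# Decoding should also agree over the full id range (100-id windows).
for start in range(0, len(all_ids) - 100, 100):
    ids = all_ids[start : start + 100]
    assert slow.decode(ids, clean_up_tokenization_spaces=False) == fast.decode(
        ids, clean_up_tokenization_spaces=False
    )

Chunking the comparison localises any divergence to a small window of the vocabulary string or id range instead of one monolithic assertion, which is what the per-chunk `subTest` labels in the patched test are for.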