From e7ea676f75b709fc0405975f9073120d367adfc4 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 6 Mar 2024 10:23:34 +0900 Subject: [PATCH 01/15] use user_defined_symbols --- src/transformers/convert_slow_tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 707bfae89db5..2edfd8258a1e 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1319,7 +1319,8 @@ def tokenizer(self, proto): raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" ) - + user_defined_symbols = [AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols] + tokenizer.add_tokens(user_defined_symbols) return tokenizer From 2b8461d1abe06844319fd0c0766c1208dd258afb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 6 Mar 2024 11:43:32 +0900 Subject: [PATCH 02/15] fixup --- src/transformers/convert_slow_tokenizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 2edfd8258a1e..582b71f2a2e0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,6 +23,7 @@ from typing import Dict, List, Tuple from packaging import version + from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece @@ -1319,7 +1320,9 @@ def tokenizer(self, proto): raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" ) - user_defined_symbols = [AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols] + user_defined_symbols = [ + AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols + ] tokenizer.add_tokens(user_defined_symbols) return tokenizer From be54724631d7db398d41ffb46388e3168e9f8606 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 6 Mar 2024 11:43:37 +0900 Subject: [PATCH 03/15] nit --- src/transformers/convert_slow_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 582b71f2a2e0..9eed8cfb42c0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,7 +23,6 @@ from typing import Dict, List, Tuple from packaging import version - from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece From ea4744fc767a4373653e25364323517b1001c40d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 18 Mar 2024 14:04:45 +1100 Subject: [PATCH 04/15] add a very robust test --- tests/models/gemma/test_tokenization_gemma.py | 2 +- tests/models/llama/test_tokenization_llama.py | 1 + tests/models/t5/test_tokenization_t5.py | 1 + tests/test_tokenization_common.py | 60 ++++++++++++++++++- 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index a16d471a24b8..a753854c31bc 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -51,7 +51,7 @@ class 
GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = GemmaTokenizer rust_tokenizer_class = GemmaTokenizerFast - + pretrained_tokenizers_to_test = [(GemmaTokenizer, "google/gemma-2b", None)] test_rust_tokenizer = False test_sentencepiece = True from_pretrained_kwargs = {} diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index f3674a83b085..0247ef701d51 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -54,6 +54,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = LlamaTokenizer rust_tokenizer_class = LlamaTokenizerFast + pretrained_tokenizers_to_test = [(LlamaTokenizer, "meta-llama/Llama-2-7b-hf", None)] test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index b0755dc1ba00..1135dd247b6e 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -40,6 +40,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast + pretrained_tokenizers_to_test = [(T5Tokenizer, "google-t5/t5-base", None)] test_rust_tokenizer = True test_sentencepiece = True diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index d0c587491144..26eb47cdda18 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -51,6 +51,7 @@ get_tests_dir, is_pt_tf_cross_test, require_jinja, + require_read_token, require_tf, require_tokenizers, require_torch, @@ -180,6 +181,7 @@ def check_subword_sampling( class TokenizerTesterMixin: tokenizer_class = None + pretrained_tokenizer_to_test = None rust_tokenizer_class = None test_slow_tokenizer = True test_rust_tokenizer = True @@ -214,7 +216,9 @@ def setUp(self) -> None: ] self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed else: - self.tokenizers_list = [] + self.tokenizers_list = ( + [] if self.pretrained_tokenizer_to_test is None else self.pretrained_tokenizer_to_test + ) with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: self._data = f_data.read().replace("\n\n", "\n").strip() @@ -1503,6 +1507,60 @@ def test_maximum_encoding_length_pair_input(self): self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :]) + @slow + @require_read_token + def test_encode_decode_fast_slow_all_tokens(self): + if self.rust_tokenizer_class is not None: + if len(self.pretrained_tokenizers_to_test) < 0: + raise ValueError( + "You have to define a `pretrained_tokenizers_to_test` attribute to the {self.class} to make sure all functionalities are properly tested." + ) + for slow_tokenizer, pretrained_name, kwargs in self.pretrained_tokenizers_to_test: + print(pretrained_name) + slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) + with self.subTest(f"{pretrained_name}"): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained( + pretrained_name, from_slow=True, legacy=False + ) + input_full_vocab_ids = list( + range(len(slow_tokenizer)) + ) # TODO let's maybe shuffle this! And run it 4 times. 
This way we cover more cmbinations + input_full_vocab_string = rust_tokenizer.convert_tokens_to_string( + rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids) + ) + print(f"Length of the input string that is tested: {len(input_full_vocab_string)}") + + for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): + string_to_check = input_full_vocab_string[chunk : chunk + 1024] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + slow_encode = slow_tokenizer.encode(string_to_check) + fast_encode = rust_tokenizer.encode(string_to_check) + self.assertEquals( + slow_encode, + fast_encode, + "Hint: the following tokenization diff were obtained for slow vs fast:\n " + f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" + f"string used : {string_to_check}", + ) + print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") + for chunk in range(0, len(input_full_vocab_ids) - 100, 100): + ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + self.assertEquals( + slow_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + rust_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}", + ) + # def test_encode_input_type(self): # tokenizers = self.get_tokenizers(do_lower_case=False) # for tokenizer in tokenizers: From 0b51d3525f4ad302eb38364c8986fa157623ea92 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 18 Mar 2024 16:43:53 +1100 Subject: [PATCH 05/15] make sure all models are tested with the `pretrained_tokenizer_to_test` --- tests/test_tokenization_common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 26eb47cdda18..ded305d0dc96 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -201,6 +201,7 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) + self.tokenizers_list = [] if self.test_rust_tokenizer: tokenizers_list = [ ( @@ -215,10 +216,10 @@ def setUp(self) -> None: or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name)) ] self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed - else: - self.tokenizers_list = ( - [] if self.pretrained_tokenizer_to_test is None else self.pretrained_tokenizer_to_test - ) + + if self.pretrained_tokenizer_to_test is not None: + self.tokenizers_list += self.pretrained_tokenizer_to_test + with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: self._data = f_data.read().replace("\n\n", "\n").strip() From 0c023301ccd1fae40a2acac702afe2689c72b111 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 18 Mar 2024 16:49:49 +1100 Subject: [PATCH 06/15] should we make sure we test all of them? 
--- tests/test_tokenization_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index ded305d0dc96..4b077c52249e 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -393,6 +393,10 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] + def test_pretrained_tokenizer_is_fully_tested(self): + if self.pretrained_tokenizer_to_test is None: + raise ValueError("This tokenizer test does not define a `pretrained_tokenizer_to_test`. This is now required. Make sure to add one.") + # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. def test_tokenize_special_tokens(self): """Test `tokenize` with special tokens.""" From 6bae13085c2748dc32bb89299d11f508a8b57319 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:24:59 +0900 Subject: [PATCH 07/15] merge --- tests/models/gemma/test_tokenization_gemma.py | 2 +- tests/models/llama/test_tokenization_llama.py | 3 +- tests/models/t5/test_tokenization_t5.py | 1 - tests/test_tokenization_common.py | 94 +++++++++---------- 4 files changed, 47 insertions(+), 53 deletions(-) diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index 122069ab6c92..5e485da491f8 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -52,7 +52,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_id = "google/gemma-7b" tokenizer_class = GemmaTokenizer rust_tokenizer_class = GemmaTokenizerFast - pretrained_tokenizers_to_test = [(GemmaTokenizer, "google/gemma-2b", None)] + test_rust_tokenizer = False test_sentencepiece = True from_pretrained_kwargs = {} diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index c34f1407a96d..0cee3347c408 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -52,10 +52,9 @@ @require_sentencepiece @require_tokenizers class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "hf-internal-testing/llama-tokenizer" + from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"] tokenizer_class = LlamaTokenizer rust_tokenizer_class = LlamaTokenizerFast - pretrained_tokenizers_to_test = [(LlamaTokenizer, "meta-llama/Llama-2-7b-hf", None)] test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index d4b516b65811..388388ff2388 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -41,7 +41,6 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_id = "google-t5/t5-small" tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast - pretrained_tokenizers_to_test = [(T5Tokenizer, "google-t5/t5-base", None)] test_rust_tokenizer = True test_sentencepiece = True diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 69b747952d5c..c9640d7474f2 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1555,55 +1555,51 @@ def test_maximum_encoding_length_pair_input(self): 
@require_read_token def test_encode_decode_fast_slow_all_tokens(self): if self.rust_tokenizer_class is not None: - if len(self.pretrained_tokenizers_to_test) < 0: - raise ValueError( - "You have to define a `pretrained_tokenizers_to_test` attribute to the {self.class} to make sure all functionalities are properly tested." - ) - for slow_tokenizer, pretrained_name, kwargs in self.pretrained_tokenizers_to_test: - print(pretrained_name) - slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) - with self.subTest(f"{pretrained_name}"): - rust_tokenizer = self.rust_tokenizer_class.from_pretrained( - pretrained_name, from_slow=True, legacy=False - ) - input_full_vocab_ids = list( - range(len(slow_tokenizer)) - ) # TODO let's maybe shuffle this! And run it 4 times. This way we cover more cmbinations - input_full_vocab_string = rust_tokenizer.convert_tokens_to_string( - rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids) - ) - print(f"Length of the input string that is tested: {len(input_full_vocab_string)}") - - for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): - string_to_check = input_full_vocab_string[chunk : chunk + 1024] - with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): - slow_encode = slow_tokenizer.encode(string_to_check) - fast_encode = rust_tokenizer.encode(string_to_check) - self.assertEquals( - slow_encode, - fast_encode, - "Hint: the following tokenization diff were obtained for slow vs fast:\n " - f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " - f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" - f"string used : {string_to_check}", - ) - print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") - for chunk in range(0, len(input_full_vocab_ids) - 100, 100): - ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] - with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): - self.assertEquals( - slow_tokenizer.decode( - ids_to_decode, - space_between_special_tokens=False, - clean_up_tokenization_spaces=False, - ), - rust_tokenizer.decode( - ids_to_decode, - space_between_special_tokens=False, - clean_up_tokenization_spaces=False, - ), - f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}", - ) + pretrained_name = self.from_pretrained_id + + slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) + with self.subTest(f"{pretrained_name}"): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained( + pretrained_name, from_slow=True, legacy=False + ) + input_full_vocab_ids = list( + range(len(slow_tokenizer)) + ) # TODO let's maybe shuffle this! And run it 4 times. 
This way we cover more cmbinations + input_full_vocab_string = rust_tokenizer.convert_tokens_to_string( + rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids) + ) + print(f"Length of the input string that is tested: {len(input_full_vocab_string)}") + + for chunk in range(0, len(input_full_vocab_string) - 1024, 1024): + string_to_check = input_full_vocab_string[chunk : chunk + 1024] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + slow_encode = slow_tokenizer.encode(string_to_check) + fast_encode = rust_tokenizer.encode(string_to_check) + self.assertEquals( + slow_encode, + fast_encode, + "Hint: the following tokenization diff were obtained for slow vs fast:\n " + f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n" + f"string used : {string_to_check}", + ) + print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}") + for chunk in range(0, len(input_full_vocab_ids) - 100, 100): + ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] + with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): + self.assertEquals( + slow_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + rust_tokenizer.decode( + ids_to_decode, + space_between_special_tokens=False, + clean_up_tokenization_spaces=False, + ), + f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}", + ) # def test_encode_input_type(self): # tokenizers = self.get_tokenizers(do_lower_case=False) From 66176758a164c31fdca2b6ab7f177aff2b99c2c6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:28:36 +0900 Subject: [PATCH 08/15] remove the id --- tests/test_tokenization_common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index c9640d7474f2..8c648f1f1aaa 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -386,9 +386,6 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] - def test_pretrained_tokenizer_is_fully_tested(self): - if self.pretrained_tokenizer_to_test is None: - raise ValueError("This tokenizer test does not define a `pretrained_tokenizer_to_test`. This is now required. Make sure to add one.") # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. def test_tokenize_special_tokens(self): From 9f174effea3e929487b8ecd5152c0edf6e6828d6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:30:21 +0900 Subject: [PATCH 09/15] fix test --- tests/test_tokenization_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 8c648f1f1aaa..3e0cf7e05638 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -386,7 +386,6 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] - # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. 
def test_tokenize_special_tokens(self): """Test `tokenize` with special tokens.""" @@ -1554,7 +1553,7 @@ def test_encode_decode_fast_slow_all_tokens(self): if self.rust_tokenizer_class is not None: pretrained_name = self.from_pretrained_id - slow_tokenizer = slow_tokenizer.from_pretrained(pretrained_name, legacy=False) + slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False) with self.subTest(f"{pretrained_name}"): rust_tokenizer = self.rust_tokenizer_class.from_pretrained( pretrained_name, from_slow=True, legacy=False From 31e8a7c9ad574d6a5141326a51bd6a803893eeba Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 18:45:30 +0900 Subject: [PATCH 10/15] update --- tests/test_tokenization_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 3e0cf7e05638..e6cd1193da08 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -202,14 +202,17 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) + self.from_pretrained_id = list(self.from_pretrained_id) + self.tokenizers_list = [] if self.test_rust_tokenizer: self.tokenizers_list = [ ( self.rust_tokenizer_class, - self.from_pretrained_id, + pretrained_id, self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, ) + for pretrained_id in self.from_pretrained_id ] else: self.tokenizers_list = [] From 9d0f0c0c8da0c8d396eae14b5ba3d4d94fa19eaf Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 19:11:38 +0900 Subject: [PATCH 11/15] ousies --- src/transformers/convert_slow_tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 9eed8cfb42c0..582b71f2a2e0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,6 +23,7 @@ from typing import Dict, List, Tuple from packaging import version + from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece From aa3f2b2ec0510203f5bbefeca7ac5788732ea724 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 19:15:26 +0900 Subject: [PATCH 12/15] oups --- tests/test_tokenization_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index e6cd1193da08..488dbe909a10 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -202,7 +202,7 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) - self.from_pretrained_id = list(self.from_pretrained_id) + self.from_pretrained_id = [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id self.tokenizers_list = [] if self.test_rust_tokenizer: From ad794806ab6f8c7b451c298d44cbdedc67565c25 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 19:18:28 +0900 Subject: [PATCH 13/15] fixup --- tests/test_tokenization_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) 
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 488dbe909a10..35e730b472ef 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -202,7 +202,9 @@ class TokenizerTesterMixin: def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) - self.from_pretrained_id = [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id + self.from_pretrained_id = ( + [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id + ) self.tokenizers_list = [] if self.test_rust_tokenizer: From 1327998597d011a07c89be8007ff20cbd23edeb7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 21:54:36 +0900 Subject: [PATCH 14/15] fix copies check --- src/transformers/convert_slow_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 582b71f2a2e0..9eed8cfb42c0 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -23,7 +23,6 @@ from typing import Dict, List, Tuple from packaging import version - from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece From d1b9bf7f1994a3f5a6b1240f8ceb78fb75405c65 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 19 Mar 2024 22:19:13 +0900 Subject: [PATCH 15/15] remove `pretrained_tokenizer_to_test` --- tests/test_tokenization_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 35e730b472ef..fa1be251e0d8 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -181,7 +181,6 @@ def check_subword_sampling( class TokenizerTesterMixin: tokenizer_class = None - pretrained_tokenizer_to_test = None rust_tokenizer_class = None test_slow_tokenizer = True test_rust_tokenizer = True
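
A minimal end-to-end sketch of what the new `test_encode_decode_fast_slow_all_tokens` check exercises (not part of the patches above, and only an illustration). It assumes one of the sentencepiece-based checkpoints now listed in `from_pretrained_id` — here `hf-internal-testing/llama-tokenizer` — is reachable and has both a slow and a fast tokenizer class:

from transformers import AutoTokenizer

# Assumption: any SPM-based checkpoint from `from_pretrained_id` works here.
checkpoint = "hf-internal-testing/llama-tokenizer"

slow = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, legacy=False)
fast = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, from_slow=True, legacy=False)

# Build one long string that touches every id in the slow vocabulary; with the
# convert_slow_tokenizer change above, user-defined symbols from the sentencepiece
# proto are added to the converted fast tokenizer, so both sides see the same tokens.
all_ids = list(range(len(slow)))
full_string = fast.convert_tokens_to_string(fast.convert_ids_to_tokens(all_ids))

# Encoding should agree chunk by chunk (1024-character windows, as in the test).
for start in range(0, len(full_string) - 1024, 1024):
    chunk = full_string[start : start + 1024]
    assert slow.encode(chunk) == fast.encode(chunk), f"slow/fast encode mismatch near offset {start}"

# Decoding should also agree over the full id range (100-id windows).
for start in range(0, len(all_ids) - 100, 100):
    ids = all_ids[start : start + 100]
    assert slow.decode(ids, clean_up_tokenization_spaces=False) == fast.decode(
        ids, clean_up_tokenization_spaces=False
    )

Chunking the comparison localises any divergence to a small window of the vocabulary string or id range instead of one monolithic assertion, which is what the per-chunk `subTest` labels in the patched test are for.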