
[GemmaConverter] use user_defined_symbols #29473

Merged · 17 commits · Mar 19, 2024
5 changes: 4 additions & 1 deletion src/transformers/convert_slow_tokenizer.py
@@ -1319,7 +1319,10 @@ def tokenizer(self, proto):
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)

user_defined_symbols = [
AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols
]
tokenizer.add_tokens(user_defined_symbols)
return tokenizer


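For context, a minimal sketch of what the new conversion step surfaces, assuming a local sentencepiece `tokenizer.model` file is available (the path and printout are illustrative, not part of the PR):

# Sketch: inspect which user_defined_symbols a sentencepiece model carries and build the
# AddedToken objects the converter now registers. The "tokenizer.model" path is illustrative.
from sentencepiece import sentencepiece_model_pb2 as model_pb2
from tokenizers import AddedToken

proto = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # any sentencepiece .model file
    proto.ParseFromString(f.read())

user_defined_symbols = [
    AddedToken(token, normalized=False, special=False)
    for token in proto.trainer_spec.user_defined_symbols
]
print(user_defined_symbols)
# A converted fast tokenizer then registers these via `tokenizer.add_tokens(user_defined_symbols)`,
# so the fast backend splits on them exactly like the slow sentencepiece model does.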
2 changes: 1 addition & 1 deletion tests/models/llama/test_tokenization_llama.py
@@ -52,7 +52,7 @@
@require_sentencepiece
@require_tokenizers
class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "hf-internal-testing/llama-tokenizer"
from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]
Member: Looks good to me!

tokenizer_class = LlamaTokenizer
rust_tokenizer_class = LlamaTokenizerFast

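With the gated `meta-llama/Llama-2-7b-hf` checkpoint added alongside the internal testing tokenizer, a minimal sketch of exercising both ids outside the test suite could look like this (the sample text is illustrative; the gated checkpoint requires an authenticated Hugging Face token):

# Sketch: check slow/fast agreement on a small sample for each checkpoint in the list.
from transformers import LlamaTokenizer, LlamaTokenizerFast

for pretrained_id in ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]:
    slow = LlamaTokenizer.from_pretrained(pretrained_id, legacy=False)
    fast = LlamaTokenizerFast.from_pretrained(pretrained_id, from_slow=True, legacy=False)
    sample = "Hello world, this is a small consistency check."  # illustrative sample
    assert slow.encode(sample) == fast.encode(sample), f"slow/fast mismatch for {pretrained_id}"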
58 changes: 57 additions & 1 deletion tests/test_tokenization_common.py
@@ -51,6 +51,7 @@
get_tests_dir,
is_pt_tf_cross_test,
require_jinja,
require_read_token,
require_tf,
require_tokenizers,
require_torch,
@@ -180,6 +181,7 @@ def check_subword_sampling(

class TokenizerTesterMixin:
tokenizer_class = None
pretrained_tokenizer_to_test = None
rust_tokenizer_class = None
test_slow_tokenizer = True
test_rust_tokenizer = True
@@ -200,13 +202,17 @@ class TokenizerTesterMixin:
def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.from_pretrained_id = (
    [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else list(self.from_pretrained_id)
)

self.tokenizers_list = []
if self.test_rust_tokenizer:
self.tokenizers_list = [
(
self.rust_tokenizer_class,
self.from_pretrained_id,
pretrained_id,
self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
)
for pretrained_id in self.from_pretrained_id
]
else:
self.tokenizers_list = []
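Since `from_pretrained_id` is now a list, `tokenizers_list` holds one `(tokenizer_class, pretrained_id, kwargs)` tuple per checkpoint. As an illustrative sketch (the helper name is hypothetical, not part of the diff), the shared tests consume it roughly like this:

# Hypothetical helper on the mixin, not part of the diff: run a check against every checkpoint tuple.
def _run_for_each_checkpoint(self, check_fn):
    for tokenizer_class, pretrained_id, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer_class.__name__} ({pretrained_id})"):
            tokenizer = tokenizer_class.from_pretrained(pretrained_id, **kwargs)
            check_fn(tokenizer)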
@@ -1544,6 +1550,56 @@ def test_maximum_encoding_length_pair_input(self):
self.assertEqual(len(overflowing_tokens), 2 + stride)
self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :])

@slow
@require_read_token
def test_encode_decode_fast_slow_all_tokens(self):
if self.rust_tokenizer_class is not None:
pretrained_name = self.from_pretrained_id[-1]

slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
with self.subTest(f"{pretrained_name}"):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
pretrained_name, from_slow=True, legacy=False
)
input_full_vocab_ids = list(
range(len(slow_tokenizer))
) # TODO let's maybe shuffle this! And run it 4 times. This way we cover more combinations
input_full_vocab_string = rust_tokenizer.convert_tokens_to_string(
rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids)
)
print(f"Length of the input string that is tested: {len(input_full_vocab_string)}")

for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
string_to_check = input_full_vocab_string[chunk : chunk + 1024]
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
slow_encode = slow_tokenizer.encode(string_to_check)
fast_encode = rust_tokenizer.encode(string_to_check)
self.assertEqual(
slow_encode,
fast_encode,
"Hint: the following tokenization diff were obtained for slow vs fast:\n "
f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n"
f"string used : {string_to_check}",
)
print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
self.assertEquals(
slow_tokenizer.decode(
ids_to_decode,
space_between_special_tokens=False,
clean_up_tokenization_spaces=False,
),
rust_tokenizer.decode(
ids_to_decode,
space_between_special_tokens=False,
clean_up_tokenization_spaces=False,
),
f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}",
)
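The TODO above suggests shuffling the vocabulary ids and repeating the run; a minimal sketch of that idea (the function name and structure are illustrative, not part of this PR):

# Sketch of the shuffling idea from the TODO: randomize the full-vocab id order a few
# times so different token adjacencies are covered. Illustrative only.
import random

def check_shuffled_vocab_roundtrips(slow_tokenizer, rust_tokenizer, n_runs=4, seed=0):
    rng = random.Random(seed)
    ids = list(range(len(slow_tokenizer)))
    for _ in range(n_runs):
        rng.shuffle(ids)
        text = rust_tokenizer.convert_tokens_to_string(rust_tokenizer.convert_ids_to_tokens(ids))
        for start in range(0, len(text) - 1024, 1024):
            chunk = text[start : start + 1024]
            assert slow_tokenizer.encode(chunk) == rust_tokenizer.encode(chunk)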

# def test_encode_input_type(self):
# tokenizers = self.get_tokenizers(do_lower_case=False)
# for tokenizer in tokenizers: