Optimize t5 tokenize logic to avoid redundant calls
leejet committed Jul 28, 2024
1 parent 44f6fdd · commit d694042
Showing 1 changed file with 1 addition and 2 deletions.
src/transformers/models/t5/tokenization_t5.py (1 addition, 2 deletions)
@@ -389,9 +389,8 @@ def _tokenize(self, text, **kwargs):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
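Why the change helps: the old code called `self.sp_model.encode(text, out_type=str)` unconditionally and then discarded that result whenever the non-legacy, prefixed-text path fell through to the second `encode` call on `self.unk_token + text`. Moving the call into the early return means every path through `_tokenize` encodes exactly once. The following self-contained sketch (not the transformers code itself; `CountingSPModel`, its toy `split()` tokenization, and the `[1:]` unk-stripping slice are illustrative stand-ins for the real SentencePiece processor) counts the `encode` calls before and after:

    # Minimal sketch of the optimization, assuming a stand-in for SentencePiece.
    SPIECE_UNDERLINE = "\u2581"

    class CountingSPModel:
        """Stand-in for the SentencePiece processor that counts encode() calls."""
        def __init__(self):
            self.calls = 0

        def encode(self, text, out_type=str):
            self.calls += 1
            return text.split()  # toy tokenization in place of real SentencePiece

    def tokenize_before(sp_model, text, unk_token="<unk>", legacy=False):
        tokens = sp_model.encode(text, out_type=str)  # always pays one encode
        if legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens
        # Slow path: re-encode with the unk prefix; the first result is wasted.
        tokens = sp_model.encode(unk_token + text, out_type=str)
        return tokens[1:]  # simplified stand-in for stripping the unk pieces

    def tokenize_after(sp_model, text, unk_token="<unk>", legacy=False):
        if legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return sp_model.encode(text, out_type=str)  # fast path: one encode
        # Slow path now also performs exactly one encode.
        tokens = sp_model.encode(unk_token + text, out_type=str)
        return tokens[1:]  # simplified stand-in for stripping the unk pieces

    sp = CountingSPModel()
    tokenize_before(sp, " Hey")
    print(sp.calls)  # 2: first encode result was discarded

    sp = CountingSPModel()
    tokenize_after(sp, " Hey")
    print(sp.calls)  # 1: redundant call eliminated

The behavior is unchanged on both paths; only the wasted `encode` on the prefixed-text path is removed, which matters because SentencePiece encoding is the bulk of the work in `_tokenize`.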
