diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 0f2ae101c8f7..1e166a78f10d 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -389,9 +389,8 @@ def _tokenize(self, text, **kwargs):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
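
For context, a minimal standalone sketch of the encoding trick this patch touches: because `add_dummy_prefix` is disabled, sentencepiece strips a leading SPIECE_UNDERLINE/space, so the non-legacy path encodes `unk_token + text` and then drops the unk pieces; the patch simply avoids the second, wasted `encode` call on the fast path. The `spiece.model` path, the `sp`/`tokenize_no_dummy_prefix` names, and the concrete piece counts are illustrative assumptions, not part of the PR:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spiece.model")  # hypothetical model path

SPIECE_UNDERLINE = "\u2581"  # the '▁' metasymbol sentencepiece uses for spaces
unk_token = "<unk>"
# Number of pieces the unk token occupies once encoded, e.g. ['<', 'unk', '>'] -> 3
# (the docstring's example model yields 4 pieces; the exact count is model-dependent).
unk_token_length = len(sp.encode(unk_token, out_type=str))

def tokenize_no_dummy_prefix(text: str, legacy: bool = False) -> list[str]:
    # Fast path: legacy behaviour, or nothing to preserve at the front.
    # After the patch, encoding happens only here instead of unconditionally up front.
    if legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
        return sp.encode(text, out_type=str)

    # 1. Encode unk_token + text so sentencepiece cannot strip the leading
    #    SPIECE_UNDERLINE/space belonging to `text` itself.
    tokens = sp.encode(unk_token + text, out_type=str)
    # 2. Drop the pieces belonging to unk_token, keeping the preserved prefix.
    return tokens[unk_token_length:] if len(tokens) >= unk_token_length else tokens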