
Commit

fix and overwrite copies
leejet committed Jul 28, 2024
1 parent d694042 commit 2a9c0af
Showing 3 changed files with 3 additions and 6 deletions.
3 changes: 1 addition & 2 deletions src/transformers/models/llama/tokenization_llama.py
@@ -261,9 +261,8 @@ def _tokenize(self, text, **kwargs):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
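The branch this hunk rewrites sits just above the prefix trick described in the docstring: when the text starts with a space or SPIECE_UNDERLINE and legacy mode is off, the tokenizer encodes `unk_token + text` and slices the `unk_token` pieces back off, so SentencePiece cannot force an extra leading piece onto the real text. A minimal standalone sketch of that trick, assuming a loaded `sentencepiece` model; the function name and the hard-coded "<unk>" are illustrative, not library API:

import sentencepiece as spm

SPIECE_UNDERLINE = "▁"

def tokenize_without_forced_prefix(sp_model: spm.SentencePieceProcessor, text: str) -> list[str]:
    # A plain encode is already correct when the text does not start with a space or "▁".
    if not text.startswith((SPIECE_UNDERLINE, " ")):
        return sp_model.encode(text, out_type=str)
    # 1. Encode with an "<unk>" prefix so the forced leading piece attaches to the prefix.
    unk_token = "<unk>"
    unk_token_length = len(sp_model.encode(unk_token, out_type=str))  # 4 in the docstring example
    tokens = sp_model.encode(unk_token + text, out_type=str)
    # 2. Drop the prefix pieces, keeping only the pieces that belong to `text` itself.
    return tokens[unk_token_length:] if len(tokens) >= unk_token_length else tokens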
@@ -463,9 +463,8 @@ def _tokenize(self, text, **kwargs):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
3 changes: 1 addition & 2 deletions src/transformers/models/udop/tokenization_udop.py
@@ -446,9 +446,8 @@ def _tokenize(self, text, **kwargs):
         `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
         `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
         """
-        tokens = self.sp_model.encode(text, out_type=str)
         if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
+            return self.sp_model.encode(text, out_type=str)
 
         # 1. Encode string + prefix ex: "<unk> Hey"
         tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
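All three hunks apply the same restructuring. A hedged reading, sketched below with simplified stand-in functions rather than the actual library methods: the old shape always ran the first `sp_model.encode(text, ...)` call, even when the non-legacy branch went on to re-encode with the `unk_token` prefix and discard that result, while the new shape encodes only on the path that actually returns the result.

SPIECE_UNDERLINE = "▁"

def _tokenize_before(self, text):
    tokens = self.sp_model.encode(text, out_type=str)       # always executed
    if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
        return tokens                                        # only this path uses the result
    ...                                                      # unk_token-prefix path discards it

def _tokenize_after(self, text):
    if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
        return self.sp_model.encode(text, out_type=str)      # encode only when returning here
    ...                                                      # unk_token-prefix path unchanged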
