Skip to content

Commit

Permalink
py : handle byte tokens in get_token_type (ggerganov#5341)
Browse files Browse the repository at this point in the history
* py : handle byte tokens in `get_token_type`

* py : fix empty bytes arg
  • Loading branch information
ggerganov authored Feb 6, 2024
1 parent 098f6d7 commit 906cff5
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,10 +515,14 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:

# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
- token_id, self.special_ids # Reuse already stored special IDs
+ token_id, token_text, self.special_ids # Reuse already stored special IDs
)

- def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
+ def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+ # Special case for byte tokens
+ if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+ return gguf.TokenType.BYTE
+
# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

Expand All @@ -530,7 +534,7 @@ def get_token_score(self, token_id: int) -> float:
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
- toktype = self.get_token_type(self.specials[text], self.special_ids)
+ toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
Expand Down

0 comments on commit 906cff5

Please sign in to comment.