
Commit

save total_vocab_size = vocab_size + user added tokens to speed up operation
itazap committed Jul 26, 2024
1 parent 1c7ebf1 commit 24f2abb
Showing 1 changed file with 11 additions and 3 deletions.
src/transformers/tokenization_utils.py

@@ -494,10 +494,17 @@ def get_added_vocab(self) -> Dict[str, int]:
 
     def __len__(self):
         """
-        Size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because otherwise if
-        there is a hole in the vocab, we will add tokenizers at a wrong index.
+        Size of the full vocabulary with the added tokens.
         """
-        return len(set(self.get_vocab().keys()))
+        return self.total_vocab_size
+
+    def _update_total_vocab_size(self):
+        """
+        Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
+        otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and
+        is only updated when adding tokens.
+        """
+        self.total_vocab_size = len(self.get_vocab())
 
     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
         """
@@ -574,6 +581,7 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
            logger.info(f"Adding {token} to the vocabulary")
 
        self._update_trie()
+       self._update_total_vocab_size()
        return added_tokens
 
    def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
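For context, a minimal usage sketch of the behavior this change targets. It assumes a local slow (Python-based) tokenizer; the checkpoint name bert-base-uncased and the variable names are illustrative, only the interplay between len() and add_tokens() comes from the diff above.

from transformers import AutoTokenizer

# use_fast=False selects the slow tokenizer implemented in tokenization_utils.py,
# which is the code path touched by this commit.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

base_size = len(tokenizer)            # now just reads the cached total_vocab_size

added = tokenizer.add_tokens(["<custom_token_1>", "<custom_token_2>"])
# add_tokens() ends up in _add_tokens(), which calls _update_total_vocab_size(),
# so the cache is refreshed on the (already slow) add-token path.
print(added)                                 # 2
print(len(tokenizer) == base_size + added)   # True: cache reflects the new tokens

The trade-off: the old __len__ rebuilt the merged vocabulary via get_vocab() on every call, while the new version pays that cost only when tokens are added and makes len(tokenizer) a constant-time attribute read.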
