
Commit e2aae0d
update vocab size within add_tokens
itazap committed Jul 29, 2024
1 parent 24f2abb commit e2aae0d
Showing 1 changed file with 2 additions and 10 deletions.
src/transformers/tokenization_utils.py
@@ -433,6 +433,7 @@ def __init__(self, **kwargs):

         # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
         # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
+        self.total_vocab_size = len(self.get_vocab())
         self._add_tokens(
             [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
             special_tokens=True,
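
This first hunk seeds `total_vocab_size` once at construction time; the last hunk below keeps it current as tokens are added, instead of recomputing it from `get_vocab()` afterwards. A minimal sketch of that bookkeeping pattern, using a hypothetical `TinyVocab` class rather than the real tokenizer:

    class TinyVocab:
        def __init__(self, base_vocab):
            self.base_vocab = dict(base_vocab)
            self.added = {}
            # Seed the count once up front (mirrors the change in __init__).
            self.total_vocab_size = len(self.get_vocab())

        def get_vocab(self):
            # Rebuilding this merged dict is O(vocab size) -- the cost the
            # commit avoids paying on every add_tokens call.
            return {**self.base_vocab, **self.added}

        def add_token(self, token):
            if token not in self.base_vocab and token not in self.added:
                self.added[token] = len(self.base_vocab) + len(self.added)
                self.total_vocab_size += 1  # O(1) update (mirrors _add_tokens)

        def __len__(self):
            return self.total_vocab_size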
@@ -498,14 +499,6 @@ def __len__(self):
"""
return self.total_vocab_size

def _update_total_vocab_size(self):
"""
Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and
is only updated when adding tokens.
"""
self.total_vocab_size = len(self.get_vocab())

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
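
The docstring of the deleted helper notes that it counts the vocab's `keys` rather than its `values`, so that a hole in the index space does not shift where new tokens land. A quick illustration with a toy mapping (hypothetical, not from the source):

    vocab = {"a": 0, "b": 1, "d": 3}    # index 2 is a hole
    len(vocab)                          # 3 tokens actually present
    max(vocab.values()) + 1             # 4 -- counting by value would overcount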
@@ -579,9 +572,8 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
             self._added_tokens_encoder[token.content] = token_index
             if self.verbose:
                 logger.info(f"Adding {token} to the vocabulary")
-
+            self.total_vocab_size += 1
         self._update_trie()
-        self._update_total_vocab_size()
         return added_tokens
 
     def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
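
Net effect: `len(tokenizer)` still reports the full vocabulary size via `total_vocab_size` (see `__len__` above), but the counter is now maintained in O(1) per added token inside `_add_tokens`, instead of rebuilding the merged vocab dict through `get_vocab()` after every call. A minimal usage sketch (assuming a slow-tokenizer checkpoint such as `bert-base-uncased` is available):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)  # slow Python tokenizer
    before = len(tok)                               # __len__ returns tok.total_vocab_size
    added = tok.add_tokens(["<new_tok_1>", "<new_tok_2>"])
    assert len(tok) == before + added               # kept in sync inside _add_tokens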
