
Commit e2aae0d
update vocab size within add_tokens
itazap committed Jul 29, 2024
1 parent 24f2abb commit e2aae0d
Showing 1 changed file with 2 additions and 10 deletions.
src/transformers/tokenization_utils.py
@@ -433,6 +433,7 @@ def __init__(self, **kwargs):

         # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
         # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
+        self.total_vocab_size = len(self.get_vocab())
         self._add_tokens(
             [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
             special_tokens=True,
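
This first hunk seeds `total_vocab_size` once at construction time; the last hunk below keeps it current as tokens are added, instead of recomputing it from `get_vocab()` afterwards. A minimal sketch of that bookkeeping pattern, using a hypothetical `TinyVocab` class rather than the real tokenizer:

    class TinyVocab:
        def __init__(self, base_vocab):
            self.base_vocab = dict(base_vocab)
            self.added = {}
            # Seed the count once up front (mirrors the change in __init__).
            self.total_vocab_size = len(self.get_vocab())

        def get_vocab(self):
            # Rebuilding this merged dict is O(vocab size) -- the cost the
            # commit avoids paying on every add_tokens call.
            return {**self.base_vocab, **self.added}

        def add_token(self, token):
            if token not in self.base_vocab and token not in self.added:
                self.added[token] = len(self.base_vocab) + len(self.added)
                self.total_vocab_size += 1  # O(1) update (mirrors _add_tokens)

        def __len__(self):
            return self.total_vocab_size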
@@ -498,14 +499,6 @@ def __len__(self):
"""
return self.total_vocab_size

def _update_total_vocab_size(self):
"""
Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and
is only updated when adding tokens.
"""
self.total_vocab_size = len(self.get_vocab())

def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
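
The docstring of the deleted helper notes that it counts the vocab's `keys` rather than its `values`, so that a hole in the index space does not shift where new tokens land. A quick illustration with a toy mapping (hypothetical, not from the source):

    vocab = {"a": 0, "b": 1, "d": 3}    # index 2 is a hole
    len(vocab)                          # 3 tokens actually present
    max(vocab.values()) + 1             # 4 -- counting by value would overcount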
@@ -579,9 +572,8 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
             self._added_tokens_encoder[token.content] = token_index
             if self.verbose:
                 logger.info(f"Adding {token} to the vocabulary")
-
+            self.total_vocab_size += 1
         self._update_trie()
-        self._update_total_vocab_size()
         return added_tokens
 
     def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
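
Net effect: `len(tokenizer)` still reports the full vocabulary size via `total_vocab_size` (see `__len__` above), but the counter is now maintained in O(1) per added token inside `_add_tokens`, instead of rebuilding the merged vocab dict through `get_vocab()` after every call. A minimal usage sketch (assuming a slow-tokenizer checkpoint such as `bert-base-uncased` is available):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)  # slow Python tokenizer
    before = len(tok)                               # __len__ returns tok.total_vocab_size
    added = tok.add_tokens(["<new_tok_1>", "<new_tok_2>"])
    assert len(tok) == before + added               # kept in sync inside _add_tokens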
