diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 493a5e145af9ac..720c1d0847a02a 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -18,7 +18,7 @@ import os import unicodedata from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -102,6 +102,20 @@ class AlbertTokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -125,11 +139,14 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -141,6 +158,7 @@ def __init__( pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -149,7 +167,7 @@ def __init__( self.keep_accents = keep_accents self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -168,7 +186,12 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): @@ -186,14 +209,10 @@ def preprocess_text(self, inputs): return outputs - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Tokenize a string.""" text = self.preprocess_text(text) - - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + pieces = self.sp_model.encode(text, out_type=str) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 95d64cfa28d152..36bdbd74499275 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -89,6 +89,20 @@ class BarthezTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -110,11 +124,14 @@ def __init__( unk_token="", pad_token="", mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -123,11 +140,12 @@ def __init__( cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} @@ -219,8 +237,8 @@ def get_vocab(self): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -243,7 +261,12 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index 795d5f504c22d5..43676e280154dd 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -58,6 +58,20 @@ class BertGenerationTokenizer(PreTrainedTokenizer): token instead. pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): The token used for padding, for example when batching sequences of different lengths. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -74,8 +88,11 @@ def __init__( unk_token="", pad_token="", sep_token="<::::>", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + # Add extra_ids to the special token list super().__init__( bos_token=bos_token, @@ -83,12 +100,13 @@ def __init__( unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -107,16 +125,17 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index e3e5a93f6da779..92f652448dae4a 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -74,7 +74,20 @@ class BigBirdTokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -93,8 +106,9 @@ def __init__( sep_token="[SEP]", mask_token="[MASK]", cls_token="[CLS]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token @@ -105,6 +119,8 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -113,12 +129,13 @@ def __init__( sep_token=sep_token, mask_token=mask_token, cls_token=cls_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -137,16 +154,17 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index b7bee4e19c49cc..ff865c6acda95d 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -85,6 +85,20 @@ class CamembertTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. 
+ + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -107,11 +121,14 @@ def __init__( pad_token="", mask_token="", additional_special_tokens=["NOTUSED", "NOTUSED"], + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -121,9 +138,10 @@ def __init__( pad_token=pad_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual @@ -218,8 +236,8 @@ def get_vocab(self): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -243,7 +261,12 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index ddb77c621b3613..66c97d4fe8778b 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -16,7 +16,7 @@ import os import unicodedata -from typing import Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as sp import six @@ -75,6 +75,20 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -92,8 +106,11 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( do_lower_case=do_lower_case, unk_token=unk_token, @@ -102,6 +119,7 @@ def __init__( cls_token=cls_token, mask_token=mask_token, split_by_punct=split_by_punct, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -112,7 +130,7 @@ def __init__( ) self.do_lower_case = do_lower_case self.split_by_punct = split_by_punct - self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct) + self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs) @property def vocab_size(self): @@ -127,7 +145,7 @@ def get_vocab(self): vocab.update(self.get_added_vocab()) return vocab - def _tokenize(self, text): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" if self.do_lower_case: text = text.lower() @@ -234,10 +252,34 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = class SPMTokenizer: - def __init__(self, vocab_file, split_by_punct=False): + r""" + Constructs a tokenizer based on `SentencePiece `__. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
+ """ + + def __init__(self, vocab_file, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None): self.split_by_punct = split_by_punct self.vocab_file = vocab_file - spm = sp.SentencePieceProcessor() + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) assert os.path.exists(vocab_file) spm.load(vocab_file) bpe_vocab_size = spm.GetPieceSize() @@ -261,7 +303,12 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.spm = sp.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) self.spm.Load(self.vocab_file) def tokenize(self, text): @@ -344,10 +391,10 @@ def _encode_as_pieces(self, text): text = convert_to_unicode(text) if self.split_by_punct: words = self._run_split_on_punc(text) - pieces = [self.spm.encode_as_pieces(w) for w in words] + pieces = [self.spm.encode(w, out_type=str) for w in words] return [p for w in pieces for p in w] else: - return self.spm.encode_as_pieces(text) + return self.spm.encode(text, out_type=str) def split_to_words(self, text): pieces = self._encode_as_pieces(text) diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index e39fbbd7aac940..93663cd4a6287b 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -16,7 +16,7 @@ from contextlib import contextmanager from pathlib import Path from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sentencepiece @@ -86,6 +86,20 @@ class M2M100Tokenizer(PreTrainedTokenizer): token instead. pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): The token used for padding, for example when batching sequences of different lengths. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Examples:: @@ -118,8 +132,11 @@ def __init__( sep_token="", pad_token="", unk_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs, - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( src_lang=src_lang, tgt_lang=tgt_lang, @@ -128,6 +145,7 @@ def __init__( sep_token=sep_token, unk_token=unk_token, pad_token=pad_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -135,7 +153,7 @@ def __init__( self.encoder = load_json(vocab_file) self.decoder = {v: k for k, v in self.encoder.items()} self.spm_file = spm_file - self.sp_model = load_spm(spm_file) + self.sp_model = load_spm(spm_file, self.sp_model_kwargs) self.encoder_size = len(self.encoder) @@ -169,7 +187,7 @@ def src_lang(self, new_src_lang: str) -> None: self.set_src_lang_special_tokens(self._src_lang) def _tokenize(self, text: str) -> List[str]: - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): if token in self.lang_token_to_id: @@ -256,7 +274,12 @@ def __getstate__(self) -> Dict: def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.sp_model = load_spm(self.spm_file) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: save_dir = Path(save_directory) @@ -330,8 +353,8 @@ def get_lang_id(self, lang: str) -> int: return self.lang_token_to_id[lang_token] -def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() +def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs) spm.Load(str(path)) return spm diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 13453f0b58c864..828afd53b9f86c 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -18,7 +18,7 @@ from contextlib import contextmanager from pathlib import Path from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sentencepiece @@ -82,6 +82,20 @@ class MarianTokenizer(PreTrainedTokenizer): The maximum sentence length the model accepts. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Examples:: @@ -115,8 +129,11 @@ def __init__( eos_token="", pad_token="", model_max_length=512, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id source_lang=source_lang, @@ -125,6 +142,7 @@ def __init__( eos_token=eos_token, pad_token=pad_token, model_max_length=model_max_length, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) assert Path(source_spm).exists(), f"cannot find spm source {source_spm}" @@ -140,8 +158,8 @@ def __init__( self.spm_files = [source_spm, target_spm] # load SentencePiece model for pre-processing - self.spm_source = load_spm(source_spm) - self.spm_target = load_spm(target_spm) + self.spm_source = load_spm(source_spm, self.sp_model_kwargs) + self.spm_target = load_spm(target_spm, self.sp_model_kwargs) self.current_spm = self.spm_source # Multilingual target side: default to using first supported language code. @@ -172,7 +190,7 @@ def remove_language_code(self, text: str): def _tokenize(self, text: str) -> List[str]: code, text = self.remove_language_code(text) - pieces = self.current_spm.EncodeAsPieces(text) + pieces = self.current_spm.encode(text, out_type=str) return code + pieces def _convert_id_to_token(self, index: int) -> str: @@ -283,7 +301,12 @@ def __getstate__(self) -> Dict: def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.spm_source, self.spm_target = (load_spm(f) for f in self.spm_files) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files) self.current_spm = self.spm_source self._setup_normalizer() @@ -308,8 +331,8 @@ def get_special_tokens_mask( return self._special_token_mask(token_ids_0 + token_ids_1) + [1] -def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() +def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs) spm.Load(path) return spm diff --git a/src/transformers/models/mbart/tokenization_mbart50.py b/src/transformers/models/mbart/tokenization_mbart50.py index ef7ec88f244636..6c11f2ab0636b2 100644 --- a/src/transformers/models/mbart/tokenization_mbart50.py +++ b/src/transformers/models/mbart/tokenization_mbart50.py @@ -16,7 +16,7 @@ import os from contextlib import contextmanager from shutil import copyfile -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -76,6 +76,20 @@ class MBart50Tokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. 
+ - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Examples:: @@ -108,11 +122,14 @@ def __init__( unk_token="", pad_token="", mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( src_lang=src_lang, tgt_lang=tgt_lang, @@ -122,10 +139,11 @@ def __init__( cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -177,7 +195,12 @@ def __getstate__(self) -> Dict: def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def get_vocab(self) -> Dict: @@ -186,7 +209,7 @@ def get_vocab(self) -> Dict: return vocab def _tokenize(self, text: str) -> List[str]: - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token: str) -> int: """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 74671c98e3d53c..15f636492388ec 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -14,7 +14,7 @@ # limitations under the License. import os from shutil import copyfile -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -77,6 +77,20 @@ class PegasusTokenizer(PreTrainedTokenizer): tokenizer `__ that uses the tokens 2 - 104 only for pretraining + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -95,10 +109,10 @@ def __init__( mask_token_sent="", additional_special_tokens=None, offset=103, # entries 2 - 104 are only used for pretraining + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: self.offset = offset - if additional_special_tokens is not None: assert isinstance( additional_special_tokens, list @@ -123,6 +137,8 @@ def __init__( additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] additional_special_tokens += [f"" for i in range(2, self.offset)] + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( eos_token=eos_token, unk_token=unk_token, @@ -131,11 +147,12 @@ def __init__( mask_token_sent=mask_token_sent, offset=offset, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.mask_token_sent = mask_token_sent self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) # add special tokens to encoder dict @@ -175,16 +192,17 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token: str) -> int: """Converts a token (str) to an id using the vocab.""" diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 535a93a31ac048..c816e73a7a613c 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -68,6 +68,20 @@ class ReformerTokenizer(PreTrainedTokenizer): The token used for padding, for example when batching sequences of different lengths. additional_special_tokens (:obj:`List[str]`, `optional`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -75,16 +89,27 @@ class ReformerTokenizer(PreTrainedTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] - def __init__(self, vocab_file, eos_token="", unk_token="", additional_special_tokens=[], **kwargs): + def __init__( + self, + vocab_file, + eos_token="", + unk_token="", + additional_special_tokens=[], + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( eos_token=eos_token, unk_token=unk_token, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -103,16 +128,17 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py index 502021d535793e..de7f05995cccd0 100644 --- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py +++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py @@ -17,7 +17,7 @@ import json from pathlib import Path from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sentencepiece @@ -79,6 +79,21 @@ class Speech2TextTokenizer(PreTrainedTokenizer): Whether or not to lowercase the input when tokenizing. tgt_lang (:obj:`str`, `optional`): A string representing the target language. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
+ **kwargs Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` """ @@ -102,8 +117,11 @@ def __init__( do_lower_case=False, tgt_lang=None, lang_codes=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs, - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -113,6 +131,7 @@ def __init__( do_lower_case=do_lower_case, tgt_lang=tgt_lang, lang_codes=lang_codes, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.do_upper_case = do_upper_case @@ -121,7 +140,7 @@ def __init__( self.encoder = load_json(vocab_file) self.decoder = {v: k for k, v in self.encoder.items()} self.spm_file = spm_file - self.sp_model = load_spm(spm_file) + self.sp_model = load_spm(spm_file, self.sp_model_kwargs) if lang_codes is not None: self.lang_codes = lang_codes @@ -155,7 +174,7 @@ def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None: self.prefix_tokens = [lang_code_id] def _tokenize(self, text: str) -> List[str]: - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): return self.encoder.get(token, self.encoder[self.unk_token]) @@ -221,7 +240,12 @@ def __getstate__(self) -> Dict: def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.sp_model = load_spm(self.spm_file) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: save_dir = Path(save_directory) @@ -241,8 +265,8 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (str(vocab_save_path), str(spm_save_path)) -def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() +def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs) spm.Load(str(path)) return spm diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 949aba04ebf216..6daf19d4c8f588 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -19,7 +19,7 @@ import re import warnings from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -81,6 +81,20 @@ class T5Tokenizer(PreTrainedTokenizer): `__). additional_special_tokens (:obj:`List[str]`, `optional`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -100,8 +114,9 @@ def __init__( pad_token="", extra_ids=100, additional_special_tokens=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: additional_special_tokens = [f"" for i in range(extra_ids)] @@ -114,19 +129,22 @@ def __init__( "In this case the additional_special_tokens must include the extra_ids tokens" ) + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, extra_ids=extra_ids, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file self._extra_ids = extra_ids - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -231,16 +249,17 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index b2707f8dcb2a7f..c0c8e90c5e0abb 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -16,7 +16,7 @@ import collections import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from ...tokenization_utils import PreTrainedTokenizer from ...utils import logging @@ -96,6 +96,20 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -117,8 +131,11 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -127,6 +144,7 @@ def __init__( pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -139,7 +157,7 @@ def __init__( ) raise - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -177,7 +195,12 @@ def __setstate__(self, d): "pip install sentencepiece" ) raise - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def get_special_tokens_mask( @@ -241,8 +264,8 @@ def get_vocab(self): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + def _tokenize(self, text: str) -> str: + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 9241c4f470fd2b..564f6e50a66f24 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -94,7 +94,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - sp_model_kwargs (:obj:`dict`, `optional`, defaults to :obj:`None`): + sp_model_kwargs (:obj:`dict`, `optional`): Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece `__ can be used, among other things, to set: @@ -129,9 +129,9 @@ def __init__( unk_token="", pad_token="", mask_token="", - sp_model_kwargs=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -271,7 +271,7 @@ def get_vocab(self): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): + def _tokenize(self, text: str) -> List[str]: return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 5137bcfee3b811..afd87e309cfe42 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -18,7 +18,7 @@ import os import unicodedata from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -99,6 +99,20 @@ class XLNetTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -124,11 +138,14 @@ def __init__( cls_token="", mask_token="", additional_special_tokens=["", ""], + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -141,6 +158,7 @@ def __init__( cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -151,7 +169,7 @@ def __init__( self.keep_accents = keep_accents self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -170,7 +188,12 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): @@ -188,14 +211,10 @@ def preprocess_text(self, inputs): return outputs - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Tokenize a string.""" text = self.preprocess_text(text) - - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + pieces = self.sp_model.encode(text, out_type=str) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): diff --git a/tests/test_tokenization_albert.py b/tests/test_tokenization_albert.py index 16596524b07761..465fa71d769e74 100644 --- a/tests/test_tokenization_albert.py +++ b/tests/test_tokenization_albert.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import os import unittest @@ -33,6 +32,8 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = AlbertTokenizer rust_tokenizer_class = AlbertTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True + test_sentencepiece_ignore_case = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_barthez.py b/tests/test_tokenization_barthez.py index 1c3a3d18ef3976..e3ba4df9b144a8 100644 --- a/tests/test_tokenization_barthez.py +++ b/tests/test_tokenization_barthez.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import unittest from transformers import BarthezTokenizer, BarthezTokenizerFast, BatchEncoding @@ -24,12 +23,13 @@ @require_tokenizers @require_sentencepiece -@slow +@slow # see https://github.com/huggingface/transformers/issues/11457 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BarthezTokenizer rust_tokenizer_class = BarthezTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_bert_generation.py b/tests/test_tokenization_bert_generation.py index d1aa93715ae070..e540b98647a9be 100644 --- a/tests/test_tokenization_bert_generation.py +++ b/tests/test_tokenization_bert_generation.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import unittest @@ -33,6 +32,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertGenerationTokenizer + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_big_bird.py b/tests/test_tokenization_big_bird.py index 5645eb401ff175..c4d700cad6bd68 100644 --- a/tests/test_tokenization_big_bird.py +++ b/tests/test_tokenization_big_bird.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import os import unittest @@ -36,11 +35,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BigBirdTokenizer rust_tokenizer_class = BigBirdTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() - tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True) tokenizer.save_pretrained(self.tmpdirname) def test_rust_and_python_full_tokenizers(self): diff --git a/tests/test_tokenization_camembert.py b/tests/test_tokenization_camembert.py index 4dc1c88de1f6ad..29faec49250e25 100644 --- a/tests/test_tokenization_camembert.py +++ b/tests/test_tokenization_camembert.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import os import unittest @@ -37,6 +36,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CamembertTokenizer rust_tokenizer_class = CamembertTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 25213e447c40cc..c8b4bbc21e1882 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -15,6 +15,7 @@ import inspect +import itertools import os import pickle import re @@ -100,6 +101,13 @@ class TokenizerTesterMixin: from_pretrained_vocab_key = "vocab_file" test_seq2seq = True + # set to True to test a sentencepiece tokenizer + test_sentencepiece = False + + # set to True to ignore casing when testing a sentencepiece tokenizer + # test_sentencepiece must also be set to True + test_sentencepiece_ignore_case = False + def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) @@ -216,6 +224,38 @@ def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences) for i in range(len(batch_encode_plus_sequences["input_ids"])) ] + def test_subword_regularization_tokenizer(self) -> None: + if not self.test_sentencepiece: + return + + # Subword regularization is only available for the slow tokenizer. 
+        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
+        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
+
+        self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
+        self.assertIsNotNone(tokenizer.sp_model_kwargs)
+        self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
+        self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
+        self.check_subword_sampling(tokenizer)
+
+    def test_pickle_subword_regularization_tokenizer(self) -> None:
+        if not self.test_sentencepiece:
+            return
+
+        # Google "pickle __getstate__ __setstate__" if you are struggling with this test.
+        # Subword regularization is only available for the slow tokenizer.
+        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
+        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
+        tokenizer_bin = pickle.dumps(tokenizer)
+        del tokenizer
+        tokenizer_new = pickle.loads(tokenizer_bin)
+
+        self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
+        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
+        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
+        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
+        self.check_subword_sampling(tokenizer_new)
+
     def test_model_input_names_signature(self):
         accepted_model_main_input_names = [
             "input_ids",  # nlp models
@@ -1727,6 +1767,46 @@ def _check_no_pad_token_padding(self, tokenizer, sequences):
         # add pad_token_id to pass subsequent tests
         tokenizer.add_special_tokens({"pad_token": "<PAD>"})
 
+    def check_subword_sampling(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        text: str = None,
+    ) -> None:
+        """
+        Check if the tokenizer generates different results when subword regularization is enabled.
+
+        Subword regularization augments training data with subword sampling.
+        This has a random component.
+
+        Args:
+            tokenizer: The tokenizer to check.
+            text: The text to use for the checks.
+        """
+        text = "This is a test for subword regularization." if text is None else text
+        if self.test_sentencepiece_ignore_case:
+            text = text.lower()
+
+        tokens_list = []
+        for _ in range(5):
+            tokens_list.append(tokenizer.tokenize(text))
+
+        # build all pairs of tokenizations for comparison
+        combinations = itertools.combinations(tokens_list, 2)
+
+        # check if sampling is done
+        subword_sampling_found = False
+        for combination in combinations:
+            if combination[0] != combination[1]:
+                subword_sampling_found = True
+        self.assertTrue(subword_sampling_found)
+
+        # check if converting back to original text works
+        for tokens in tokens_list:
+            if self.test_sentencepiece_ignore_case:
+                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
+            else:
+                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
+
     @require_torch
     @slow
     def test_torch_encode_plus_sent_to_model(self):
diff --git a/tests/test_tokenization_deberta_v2.py b/tests/test_tokenization_deberta_v2.py
index 2fdf74d003c49e..fbc1c2d10da49f 100644
--- a/tests/test_tokenization_deberta_v2.py
+++ b/tests/test_tokenization_deberta_v2.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import os
 import unittest
 
@@ -33,6 +32,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = DebertaV2Tokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False
+    test_sentencepiece = True
+    test_sentencepiece_ignore_case = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_m2m_100.py b/tests/test_tokenization_m2m_100.py
index 4f7cf6ffae5b4f..b151625eeb0fcb 100644
--- a/tests/test_tokenization_m2m_100.py
+++ b/tests/test_tokenization_m2m_100.py
@@ -45,6 +45,7 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = M2M100Tokenizer
     test_rust_tokenizer = False
     test_seq2seq = False
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_marian.py b/tests/test_tokenization_marian.py
index 3d9146b11fb6ef..f3986d9c724895 100644
--- a/tests/test_tokenization_marian.py
+++ b/tests/test_tokenization_marian.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import os
 import tempfile
 import unittest
@@ -50,6 +49,7 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = MarianTokenizer
     test_rust_tokenizer = False
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_mbart50.py b/tests/test_tokenization_mbart50.py
index 49dfc0b66f4664..5d0c4362d3e958 100644
--- a/tests/test_tokenization_mbart50.py
+++ b/tests/test_tokenization_mbart50.py
@@ -38,6 +38,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = MBart50Tokenizer
     rust_tokenizer_class = MBart50TokenizerFast
     test_rust_tokenizer = True
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py
index 0db2d34cd7f2d3..8b15b339c4d0c6 100644
--- a/tests/test_tokenization_pegasus.py
+++ b/tests/test_tokenization_pegasus.py
@@ -31,6 +31,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = PegasusTokenizer
     rust_tokenizer_class = PegasusTokenizerFast
     test_rust_tokenizer = True
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
@@ -104,6 +105,7 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = PegasusTokenizer
     rust_tokenizer_class = PegasusTokenizerFast
     test_rust_tokenizer = True
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_reformer.py b/tests/test_tokenization_reformer.py
index 179cf9bcd16a33..1729ba8d9d3766 100644
--- a/tests/test_tokenization_reformer.py
+++ b/tests/test_tokenization_reformer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import os
 import unittest
 
@@ -34,6 +33,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     rust_tokenizer_class = ReformerTokenizerFast
     test_rust_tokenizer = True
     test_seq2seq = False
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_speech_to_text.py b/tests/test_tokenization_speech_to_text.py
index 2a42b04a5059c4..08a715038885b5 100644
--- a/tests/test_tokenization_speech_to_text.py
+++ b/tests/test_tokenization_speech_to_text.py
@@ -40,6 +40,7 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = Speech2TextTokenizer
     test_rust_tokenizer = False
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py
index 26d8317b5a31fc..be64acf083695c 100644
--- a/tests/test_tokenization_t5.py
+++ b/tests/test_tokenization_t5.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import unittest
 
 from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
@@ -40,6 +39,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = T5Tokenizer
     rust_tokenizer_class = T5TokenizerFast
     test_rust_tokenizer = True
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_xlm_prophetnet.py b/tests/test_tokenization_xlm_prophetnet.py
index dd426547ac8692..771bb8c6d38b9c 100644
--- a/tests/test_tokenization_xlm_prophetnet.py
+++ b/tests/test_tokenization_xlm_prophetnet.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import os
 import unittest
 
@@ -32,6 +31,7 @@ class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = XLMProphetNetTokenizer
     test_rust_tokenizer = False
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
diff --git a/tests/test_tokenization_xlm_roberta.py b/tests/test_tokenization_xlm_roberta.py
index b9fe4dde628120..816ad179251366 100644
--- a/tests/test_tokenization_xlm_roberta.py
+++ b/tests/test_tokenization_xlm_roberta.py
@@ -13,10 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import itertools
 import os
-import pickle
 import unittest
 
 from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
@@ -36,6 +33,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = XLMRobertaTokenizer
     rust_tokenizer_class = XLMRobertaTokenizerFast
     test_rust_tokenizer = True
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
@@ -120,41 +118,6 @@ def test_full_tokenizer(self):
             ],
         )
 
-    def test_subword_regularization_tokenizer(self):
-        # Subword regularization is only available for the slow tokenizer.
-        tokenizer = XLMRobertaTokenizer(
-            SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
-        )
-
-        # Subword regularization augments training data with subword sampling.
-        # This has a random component. We test if the tokenizer generates different
-        # results when subword regularization is enabled.
-        tokens_list = []
-        for _ in range(5):
-            tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))
-
-        # the list of different pairs of tokens_list
-        combinations = itertools.combinations(tokens_list, 2)
-
-        all_equal = True
-        for combination in combinations:
-            if combination[0] != combination[1]:
-                all_equal = False
-
-        self.assertFalse(all_equal)
-
-    def test_pickle_subword_regularization_tokenizer(self):
-        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
-        # Subword regularization is only available for the slow tokenizer.
-        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
-        tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs=sp_model_kwargs)
-        tokenizer_bin = pickle.dumps(tokenizer)
-        tokenizer_new = pickle.loads(tokenizer_bin)
-
-        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
-        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
-        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
-
     @cached_property
     def big_tokenizer(self):
         return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
diff --git a/tests/test_tokenization_xlnet.py b/tests/test_tokenization_xlnet.py
index fb018ec5c25e8d..c7168b38c568fa 100644
--- a/tests/test_tokenization_xlnet.py
+++ b/tests/test_tokenization_xlnet.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import os
 import unittest
 
@@ -33,6 +32,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = XLNetTokenizer
     rust_tokenizer_class = XLNetTokenizerFast
     test_rust_tokenizer = True
+    test_sentencepiece = True
 
     def setUp(self):
         super().setUp()
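
Note for reviewers (not part of the patch): the sketch below illustrates how the new `sp_model_kwargs` argument is meant to be used with any of the slow SentencePiece tokenizers touched by this diff. The checkpoint name `albert-base-v2` and the sampling values are only illustrative assumptions, not something this patch prescribes.

```python
# Illustrative sketch only: enable subword regularization via `sp_model_kwargs`.
# The checkpoint name and sampling values are example choices, not part of the diff.
import pickle

from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained(
    "albert-base-v2",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, repeated tokenization of the same text may differ.
for _ in range(3):
    print(tokenizer.tokenize("This is a test for subword regularization."))

# Pickling round-trips keep the kwargs, mirroring test_pickle_subword_regularization_tokenizer.
restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.sp_model_kwargs == tokenizer.sp_model_kwargs
```

As the new tests note, subword regularization is only available for the slow (Python) tokenizers; the fast Rust tokenizers do not take `sp_model_kwargs`.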