Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Honor trust_remote_code for custom tokenizers #28854

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/transformers/models/auto/tokenization_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
_ = kwargs.pop("code_revision", None)
if os.path.isdir(pretrained_model_name_or_path):
tokenizer_class.register_for_auto_class()
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
return tokenizer_class.from_pretrained(
pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
)
elif config_tokenizer_class is not None:
tokenizer_class = None
if use_fast and not config_tokenizer_class.endswith("Fast"):
Expand Down
12 changes: 12 additions & 0 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1810,6 +1810,7 @@ def from_pretrained(
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
trust_remote_code=False,
**kwargs,
):
r"""
Expand Down Expand Up @@ -1855,6 +1856,10 @@ def from_pretrained(
facebook/rag-token-base), specify it here.
inputs (additional positional arguments, *optional*):
Will be passed along to the Tokenizer `__init__` method.
trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
should only be set to `True` for repositories you trust and in which you have read the code, as it will
execute code present on the Hub on your local machine.
kwargs (additional keyword arguments, *optional*):
Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
`eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
Expand Down Expand Up @@ -2038,6 +2043,7 @@ def from_pretrained(
local_files_only=local_files_only,
_commit_hash=commit_hash,
_is_local=is_local,
trust_remote_code=trust_remote_code,
**kwargs,
)

Expand All @@ -2053,6 +2059,7 @@ def _from_pretrained(
local_files_only=False,
_commit_hash=None,
_is_local=False,
trust_remote_code=False,
**kwargs,
):
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
Expand Down Expand Up @@ -2101,6 +2108,10 @@ def _from_pretrained(
)

if config_tokenizer_class is None:
# Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo.
# If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with
# AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain.
# Maybe we can just remove this entirely?
from .models.auto.configuration_auto import AutoConfig # tests_ignore

# Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
Expand All @@ -2110,6 +2121,7 @@ def _from_pretrained(
token=token,
cache_dir=cache_dir,
local_files_only=local_files_only,
trust_remote_code=trust_remote_code,
_commit_hash=_commit_hash,
)
config_tokenizer_class = config.tokenizer_class
Expand Down
71 changes: 71 additions & 0 deletions tests/models/auto/test_tokenization_auto.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lgtm, not that bad to not push to the hub that way we see exactly what code was used 😉

Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import sys
Expand Down Expand Up @@ -427,3 +428,73 @@ def test_cached_tokenizer_has_minimum_calls_to_head(self):
self.assertEqual(counter["GET"], 0)
self.assertEqual(counter["HEAD"], 1)
self.assertEqual(counter.total_calls, 1)

def test_init_tokenizer_with_trust(self):
nop_tokenizer_code = """
import transformers

class NopTokenizer(transformers.PreTrainedTokenizer):
def get_vocab(self):
return {}
"""

nop_config_code = """
from transformers import PretrainedConfig

class NopConfig(PretrainedConfig):
model_type = "test_unregistered_dynamic"

def __init__(self, **kwargs):
super().__init__(**kwargs)
"""

with tempfile.TemporaryDirectory() as tmp_dir:
fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
fake_repo = os.path.join(tmp_dir, fake_model_id)
os.makedirs(fake_repo)

tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
with open(tokenizer_src_file, "w") as wfp:
wfp.write(nop_tokenizer_code)

model_config_src_file = os.path.join(fake_repo, "config.py")
with open(model_config_src_file, "w") as wfp:
wfp.write(nop_config_code)

config = {
"model_type": "test_unregistered_dynamic",
"auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
}

config_file = os.path.join(fake_repo, "config.json")
with open(config_file, "w") as wfp:
json.dump(config, wfp, indent=2)

tokenizer_config = {
"auto_map": {
"AutoTokenizer": [
f"{fake_model_id}--tokenizer.NopTokenizer",
None,
]
}
}

tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
with open(tokenizer_config_file, "w") as wfp:
json.dump(tokenizer_config, wfp, indent=2)

prev_dir = os.getcwd()
try:
# it looks like subdir= is broken in the from_pretrained also, so this is necessary
os.chdir(tmp_dir)

# this should work because we trust the code
_ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)
try:
# this should fail because we don't trust and we're not at a terminal for interactive response
_ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False)
self.fail("AutoTokenizer.from_pretrained with trust_remote_code=False should raise ValueException")
except ValueError:
pass
finally:
os.chdir(prev_dir)
Loading