Skip to content

Commit

Permalink
Pass datasets trust_remote_code (#31406)
Browse files Browse the repository at this point in the history
* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b767282.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
  • Loading branch information
albertvillanova authored Jun 17, 2024
1 parent 485fd81 commit a14b055
Show file tree
Hide file tree
Showing 168 changed files with 804 additions and 410 deletions.
2 changes: 1 addition & 1 deletion examples/flax/_tests_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- datasets >= 1.13.3,<2.20.0 # Temporary upper version
+ datasets >= 1.13.3
pytest<8.0.1
conllu
nltk
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/image-captioning/run_image_captioning_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -458,6 +458,7 @@ def main():
keep_in_memory=False,
data_dir=data_args.data_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
13 changes: 13 additions & 0 deletions examples/flax/language-modeling/run_bart_dlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
Expand Down Expand Up @@ -518,6 +528,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)

if "validation" not in datasets.keys():
Expand All @@ -528,6 +539,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -536,6 +548,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 6 additions & 3 deletions examples/flax/language-modeling/run_clm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -408,6 +408,7 @@ def main():
keep_in_memory=False,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)

if "validation" not in dataset.keys():
Expand All @@ -418,6 +419,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
dataset["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -426,6 +428,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 6 additions & 3 deletions examples/flax/language-modeling/run_mlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -446,6 +446,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)

if "validation" not in datasets.keys():
Expand All @@ -456,6 +457,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -464,6 +466,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
13 changes: 13 additions & 0 deletions examples/flax/language-modeling/run_t5_mlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
Expand Down Expand Up @@ -560,6 +570,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)

if "validation" not in datasets.keys():
Expand All @@ -570,6 +581,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -578,6 +590,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/question-answering/run_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -498,6 +498,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
# Loading the dataset from local csv or json file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
text_column: Optional[str] = field(
default=None,
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
Expand Down Expand Up @@ -442,6 +452,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_code,
)

if training_args.do_eval:
Expand All @@ -452,6 +463,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_code,
)

if not training_args.do_train and not training_args.do_eval:
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/summarization/run_summarization_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -485,6 +485,7 @@ def main():
cache_dir=model_args.cache_dir,
keep_in_memory=False,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
1 change: 1 addition & 0 deletions examples/flax/test_flax_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ def test_run_flax_speech_recognition_seq2seq(self):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/token-classification/run_flax_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -449,6 +449,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
# Loading the dataset from local csv or json file.
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/_tests_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ streamlit
elasticsearch
nltk
pandas
- datasets >= 1.13.3,<2.20.0 # Temporary upper version
+ datasets >= 1.13.3
fire
pytest<8.0.1
conllu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -261,12 +261,14 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
raw_datasets["eval"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
Expand Down
7 changes: 4 additions & 3 deletions examples/pytorch/contrastive-image-text/run_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -305,6 +305,7 @@ def main():
keep_in_memory=False,
data_dir=data_args.data_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -242,6 +242,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,11 @@ def parse_args():
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument(
"--trust_remote_code",
- type=bool,
- default=False,
+ action="store_true",
help=(
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
),
)
parser.add_argument(
Expand Down Expand Up @@ -284,7 +283,7 @@ def main():
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
- dataset = load_dataset(args.dataset_name)
+ dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
else:
data_files = {}
if args.train_dir is not None:
Expand Down
Loading

0 comments on commit a14b055

Please sign in to comment.