Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass datasets trust_remote_code #31406

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a015b2b
Pass datasets trust_remote_code
albertvillanova Jun 13, 2024
469af49
Pass trust_remote_code in more tests
albertvillanova Jun 13, 2024
8bff20f
Add trust_remote_dataset_code arg to some tests
albertvillanova Jun 14, 2024
5b92431
Merge remote-tracking branch 'upstream/main' into set-datasets-trust-…
albertvillanova Jun 14, 2024
19513a7
Revert "Temporarily pin datasets upper version to fix CI"
albertvillanova Jun 14, 2024
04fc84a
Pass trust_remote_code in librispeech_asr_dummy docstrings
albertvillanova Jun 14, 2024
8a6b329
Merge remote-tracking branch 'upstream/main' into set-datasets-trust-…
albertvillanova Jun 14, 2024
d0af733
Revert "Pin datasets<2.20.0 for examples"
albertvillanova Jun 14, 2024
e5877bc
Pass trust_remote_code to all examples
albertvillanova Jun 14, 2024
c8e1a96
Revert "Add trust_remote_dataset_code arg to some tests" to research_…
albertvillanova Jun 14, 2024
c5f4ef2
Pass trust_remote_code to tests
albertvillanova Jun 14, 2024
cb319f9
Pass trust_remote_code to docstrings
albertvillanova Jun 14, 2024
4c11530
Fix flax examples tests requirements
albertvillanova Jun 14, 2024
1f9c256
Pass trust_remote_dataset_code arg to tests
albertvillanova Jun 14, 2024
8380ae5
Replace trust_remote_dataset_code with trust_remote_code in one example
albertvillanova Jun 14, 2024
428ccd3
Fix duplicate trust_remote_code
albertvillanova Jun 14, 2024
a56b035
Replace args.trust_remote_dataset_code with args.trust_remote_code
albertvillanova Jun 16, 2024
08c18ac
Replace trust_remote_dataset_code with trust_remote_code in parser
albertvillanova Jun 17, 2024
0c2209d
Replace trust_remote_dataset_code with trust_remote_code in dataclasses
albertvillanova Jun 17, 2024
d92e71b
Replace trust_remote_dataset_code with trust_remote_code arg
albertvillanova Jun 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/flax/_tests_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
datasets >= 1.13.3,<2.20.0 # Temporary upper version
datasets >= 1.13.3
pytest<8.0.1
conllu
nltk
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/image-captioning/run_image_captioning_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -458,6 +458,7 @@ def main():
keep_in_memory=False,
data_dir=data_args.data_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
13 changes: 13 additions & 0 deletions examples/flax/language-modeling/run_bart_dlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
Expand Down Expand Up @@ -518,6 +528,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)

if "validation" not in datasets.keys():
Expand All @@ -528,6 +539,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -536,6 +548,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 6 additions & 3 deletions examples/flax/language-modeling/run_clm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -408,6 +408,7 @@ def main():
keep_in_memory=False,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)

if "validation" not in dataset.keys():
Expand All @@ -418,6 +419,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
dataset["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -426,6 +428,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 6 additions & 3 deletions examples/flax/language-modeling/run_mlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -446,6 +446,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)

if "validation" not in datasets.keys():
Expand All @@ -456,6 +457,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -464,6 +466,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
13 changes: 13 additions & 0 deletions examples/flax/language-modeling/run_t5_mlm_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
Expand Down Expand Up @@ -560,6 +570,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)

if "validation" not in datasets.keys():
Expand All @@ -570,6 +581,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -578,6 +590,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
num_proc=data_args.preprocessing_num_workers,
trust_remote_code=data_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/question-answering/run_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -498,6 +498,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
# Loading the dataset from local csv or json file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
text_column: Optional[str] = field(
default=None,
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
Expand Down Expand Up @@ -442,6 +452,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_code,
)

if training_args.do_eval:
Expand All @@ -452,6 +463,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_code,
)

if not training_args.do_train and not training_args.do_eval:
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/summarization/run_summarization_flax.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -485,6 +485,7 @@ def main():
cache_dir=model_args.cache_dir,
keep_in_memory=False,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
1 change: 1 addition & 0 deletions examples/flax/test_flax_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ def test_run_flax_speech_recognition_seq2seq(self):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2
Expand Down
7 changes: 4 additions & 3 deletions examples/flax/token-classification/run_flax_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -449,6 +449,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
# Loading the dataset from local csv or json file.
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/_tests_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ streamlit
elasticsearch
nltk
pandas
datasets >= 1.13.3,<2.20.0 # Temporary upper version
datasets >= 1.13.3
fire
pytest<8.0.1
conllu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -261,12 +261,14 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
raw_datasets["eval"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
Expand Down
7 changes: 4 additions & 3 deletions examples/pytorch/contrastive-image-text/run_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -305,6 +305,7 @@ def main():
keep_in_memory=False,
data_dir=data_args.data_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -242,6 +242,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,11 @@ def parse_args():
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
parser.add_argument(
"--trust_remote_code",
type=bool,
default=False,
action="store_true",
help=(
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
"execute code present on the Hub on your local machine."
"Whether to trust the execution of code from datasets/models defined on the Hub."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
),
)
parser.add_argument(
Expand Down Expand Up @@ -284,7 +283,7 @@ def main():
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
dataset = load_dataset(args.dataset_name)
dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
else:
data_files = {}
if args.train_dir is not None:
Expand Down
Loading
Loading