Pass datasets trust_remote_code #31406

Merged

20 commits (the diff below shows changes from 5 commits)
a015b2b  Pass datasets trust_remote_code (albertvillanova, Jun 13, 2024)
469af49  Pass trust_remote_code in more tests (albertvillanova, Jun 13, 2024)
8bff20f  Add trust_remote_dataset_code arg to some tests (albertvillanova, Jun 14, 2024)
5b92431  Merge remote-tracking branch 'upstream/main' into set-datasets-trust-… (albertvillanova, Jun 14, 2024)
19513a7  Revert "Temporarily pin datasets upper version to fix CI" (albertvillanova, Jun 14, 2024)
04fc84a  Pass trust_remote_code in librispeech_asr_dummy docstrings (albertvillanova, Jun 14, 2024)
8a6b329  Merge remote-tracking branch 'upstream/main' into set-datasets-trust-… (albertvillanova, Jun 14, 2024)
d0af733  Revert "Pin datasets<2.20.0 for examples" (albertvillanova, Jun 14, 2024)
e5877bc  Pass trust_remote_code to all examples (albertvillanova, Jun 14, 2024)
c8e1a96  Revert "Add trust_remote_dataset_code arg to some tests" to research_… (albertvillanova, Jun 14, 2024)
c5f4ef2  Pass trust_remote_code to tests (albertvillanova, Jun 14, 2024)
cb319f9  Pass trust_remote_code to docstrings (albertvillanova, Jun 14, 2024)
4c11530  Fix flax examples tests requirements (albertvillanova, Jun 14, 2024)
1f9c256  Pass trust_remote_dataset_code arg to tests (albertvillanova, Jun 14, 2024)
8380ae5  Replace trust_remote_dataset_code with trust_remote_code in one example (albertvillanova, Jun 14, 2024)
428ccd3  Fix duplicate trust_remote_code (albertvillanova, Jun 14, 2024)
a56b035  Replace args.trust_remote_dataset_code with args.trust_remote_code (albertvillanova, Jun 16, 2024)
08c18ac  Replace trust_remote_dataset_code with trust_remote_code in parser (albertvillanova, Jun 17, 2024)
0c2209d  Replace trust_remote_dataset_code with trust_remote_code in dataclasses (albertvillanova, Jun 17, 2024)
d92e71b  Replace trust_remote_dataset_code with trust_remote_code arg (albertvillanova, Jun 17, 2024)
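For context: this PR prepares transformers for datasets 2.20.0, where `trust_remote_code` defaults to `False` and loading a Hub dataset that ships a Python loading script fails unless the caller opts in explicitly (which is also why the `<2.20.0` pins are reverted below). A minimal sketch of the call pattern the diff standardizes on, using the dummy dataset already referenced throughout:

from datasets import load_dataset

# With datasets>=2.20.0, script-based datasets require an explicit opt-in;
# without it, load_dataset raises instead of silently executing Hub code.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,  # only for repositories whose code you have reviewed
)
print(ds[0]["audio"]["sampling_rate"])

Note that the diff below is the 5-commit snapshot: the later commits (a56b035 through d92e71b) rename the interim `trust_remote_dataset_code` argument to plain `trust_remote_code`.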
@@ -136,6 +136,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
text_column: Optional[str] = field(
default=None,
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
@@ -442,6 +452,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if training_args.do_eval:
@@ -452,6 +463,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if not training_args.do_train and not training_args.do_eval:
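The example scripts parse `DataTrainingArguments` with `HfArgumentParser`, so the boolean field above surfaces directly as a `--trust_remote_dataset_code` flag. A self-contained sketch of that mechanism (not part of the diff):

from dataclasses import dataclass, field

from transformers import HfArgumentParser

@dataclass
class DataTrainingArguments:
    trust_remote_dataset_code: bool = field(
        default=False,
        metadata={"help": "Whether to trust dataset loading scripts from the Hub."},
    )

# HfArgumentParser maps boolean dataclass fields to optional CLI flags,
# so passing the bare flag flips the default False to True.
(data_args,) = HfArgumentParser(DataTrainingArguments).parse_args_into_dataclasses(
    ["--trust_remote_dataset_code"]
)
assert data_args.trust_remote_dataset_code is True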
1 change: 1 addition & 0 deletions examples/flax/test_flax_examples.py
@@ -265,6 +265,7 @@ def test_run_flax_speech_recognition_seq2seq(self):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2
@@ -71,6 +71,15 @@ def parse_args():
required=True,
help="The names of the training data set splits to use (via the datasets library).",
)
parser.add_argument(
"--trust_remote_dataset_code",
action="store_true",
help=(
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
),
)
parser.add_argument(
"--preprocessing_num_workers",
type=int,
@@ -446,6 +455,7 @@ def main():
dataset_config_name,
split=train_split_name,
cache_dir=args.cache_dir,
trust_remote_code=args.trust_remote_dataset_code,
)
datasets_splits.append(dataset_split)

12 changes: 12 additions & 0 deletions examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -261,6 +261,16 @@ class DataTrainingArguments:
)
},
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
@@ -454,6 +464,7 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -479,6 +490,7 @@ def main():
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.max_eval_samples is not None:
@@ -251,6 +251,16 @@ class DataTrainingArguments:
)
},
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
@@ -434,6 +444,7 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,6 +470,7 @@ def main():
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.max_eval_samples is not None:
@@ -143,6 +143,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
@@ -347,6 +357,7 @@ def main():
split=data_args.train_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if training_args.do_eval:
@@ -356,6 +367,7 @@ def main():
split=data_args.eval_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
4 changes: 4 additions & 0 deletions examples/pytorch/test_pytorch_examples.py
@@ -424,6 +424,7 @@ def test_run_speech_recognition_ctc(self):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--do_train
--do_eval
--learning_rate 1e-4
@@ -454,6 +455,7 @@ def test_run_speech_recognition_ctc_adapter(self):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--do_train
--do_eval
--learning_rate 1e-4
@@ -486,6 +488,7 @@ def test_run_speech_recognition_seq2seq(self):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--do_train
--do_eval
--learning_rate 1e-4
@@ -547,6 +550,7 @@ def test_run_wav2vec2_pretraining(self):
--dataset_name hf-internal-testing/librispeech_asr_dummy
--dataset_config_names clean
--dataset_split_names validation
--trust_remote_dataset_code
--learning_rate 1e-4
--per_device_train_batch_size 4
--per_device_eval_batch_size 4
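The test snippets above are fragments of the argv strings each test builds and splits before calling the example script's `main()`. A sketch of the pattern used in `test_pytorch_examples.py` (simplified; the real tests add output dirs, epoch counts, and assertions on the results):

import sys
from unittest.mock import patch

testargs = """
    run_speech_recognition_ctc.py
    --dataset_name hf-internal-testing/librispeech_asr_dummy
    --dataset_config_name clean
    --trust_remote_dataset_code
    """.split()

# Patching sys.argv drives the script's argument parser in-process,
# avoiding a subprocess per test.
with patch.object(sys, "argv", testargs):
    pass  # the real test imports the example module and calls its main() here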
20 changes: 18 additions & 2 deletions examples/research_projects/wav2vec2/run_asr.py
@@ -104,6 +104,16 @@ class DataTrainingArguments:
)
},
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
target_text_column: Optional[str] = field(
default="text",
metadata={"help": "Column in the dataset that contains label (target text). Defaults to 'text'"},
@@ -355,6 +365,16 @@ def main():
)

train_dataset = datasets.load_dataset(
-data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
+data_args.dataset_name,
+data_args.dataset_config_name,
+split=data_args.train_split_name,
+trust_remote_code=data_args.trust_remote_dataset_code,
)
val_dataset = datasets.load_dataset(
-data_args.dataset_name, data_args.dataset_config_name, split=data_args.validation_split_name
+data_args.dataset_name,
+data_args.dataset_config_name,
+split=data_args.validation_split_name,
+trust_remote_code=data_args.trust_remote_dataset_code,
)

wer_metric = datasets.load_metric("wer")
@@ -157,6 +157,7 @@ def run_trainer(
--dataset_config_name clean
--train_split_name validation
--validation_split_name validation
--trust_remote_dataset_code
--output_dir {output_dir}
--num_train_epochs {str(num_train_epochs)}
--per_device_train_batch_size 2
2 changes: 1 addition & 1 deletion setup.py
@@ -102,7 +102,7 @@
"codecarbon==1.2.0",
"cookiecutter==1.7.3",
"dataclasses",
"datasets!=2.5.0,<2.20.0", # Temporary upper version
"datasets!=2.5.0",
"decord==0.6.0",
"deepspeed>=0.9.3",
"diffusers",
4 changes: 3 additions & 1 deletion src/transformers/commands/pt_to_tf.py
@@ -202,7 +202,9 @@ def get_inputs(self, pt_model, tf_dummy_inputs, config):
"""

def _get_audio_input():
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
raw_samples = [x["array"] for x in speech_samples]
return raw_samples
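Note the asymmetry: internal tooling like `pt_to_tf` hardcodes `trust_remote_code=True` because it only ever loads `hf-internal-testing/librispeech_asr_dummy`, a repository maintained by the Hugging Face team, while the user-facing example scripts expose the decision as an opt-in flag.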
2 changes: 1 addition & 1 deletion src/transformers/dependency_versions_table.py
@@ -9,7 +9,7 @@
"codecarbon": "codecarbon==1.2.0",
"cookiecutter": "cookiecutter==1.7.3",
"dataclasses": "dataclasses",
"datasets": "datasets!=2.5.0,<2.20.0",
"datasets": "datasets!=2.5.0",
"decord": "decord==0.6.0",
"deepspeed": "deepspeed>=0.9.3",
"diffusers": "diffusers",
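The same change lands in both files because `setup.py` holds the canonical dependency list and `src/transformers/dependency_versions_table.py` is generated from it (via `make deps_table_update`); editing only one of the two would trip the repository's consistency checks.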
@@ -153,7 +153,9 @@ def test_double_precision_pad(self):
def _load_datasamples(self, num_samples):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

4 changes: 3 additions & 1 deletion tests/models/clap/test_feature_extraction_clap.py
@@ -164,7 +164,9 @@ def test_double_precision_pad(self):

# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

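The `# Copied from` marker above explains the fan-out of identical edits: `utils/check_copies.py` enforces that blocks marked as copies stay identical to their source, so the `trust_remote_code` change to the Whisper test's `_load_datasamples` has to be replicated verbatim in every copy.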
16 changes: 12 additions & 4 deletions tests/models/clap/test_modeling_clap.py
@@ -665,7 +665,9 @@ def test_integration_unfused(self):
"repeat": 0.0023,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_sample = librispeech_dummy[-1]

model_id = "laion/clap-htsat-unfused"
@@ -692,7 +694,9 @@ def test_integration_fused(self):
"pad": -0.000379,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_sample = librispeech_dummy[-1]

model_id = "laion/clap-htsat-fused"
@@ -719,7 +723,9 @@ def test_batched_fused(self):
"pad": 0.0006,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]

model_id = "laion/clap-htsat-fused"
@@ -746,7 +752,9 @@ def test_batched_unfused(self):
"pad": 0.0019,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]

model_id = "laion/clap-htsat-unfused"
4 changes: 3 additions & 1 deletion tests/models/clvp/test_feature_extraction_clvp.py
@@ -209,7 +209,9 @@ def test_double_precision_pad(self):
self.assertTrue(pt_processed.input_features.dtype == torch.float32)

def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
ds = ds.cast_column("audio", Audio(sampling_rate=22050))
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
8 changes: 6 additions & 2 deletions tests/models/clvp/test_modeling_clvp.py
@@ -371,7 +371,9 @@ def get_config(self):
def prepare_config_and_inputs(self):
_, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()

-ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ds = datasets.load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()

@@ -553,7 +555,9 @@ def test_model_from_pretrained(self):
class ClvpIntegrationTest(unittest.TestCase):
def setUp(self):
self.text = "This is an example text."
-ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ds = datasets.load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()

4 changes: 3 additions & 1 deletion tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -697,7 +697,9 @@ def test_compute_mask_indices_short_audio(self):
@slow
class Data2VecAudioModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
4 changes: 3 additions & 1 deletion tests/models/encodec/test_feature_extraction_encodec.py
@@ -138,7 +138,9 @@ def test_double_precision_pad(self):
def _load_datasamples(self, num_samples):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
