Pass datasets trust_remote_code #31406

Merged

20 commits (the diff below shows changes from 5 commits)
a015b2b  Pass datasets trust_remote_code (albertvillanova, Jun 13, 2024)
469af49  Pass trust_remote_code in more tests (albertvillanova, Jun 13, 2024)
8bff20f  Add trust_remote_dataset_code arg to some tests (albertvillanova, Jun 14, 2024)
5b92431  Merge remote-tracking branch 'upstream/main' into set-datasets-trust-… (albertvillanova, Jun 14, 2024)
19513a7  Revert "Temporarily pin datasets upper version to fix CI" (albertvillanova, Jun 14, 2024)
04fc84a  Pass trust_remote_code in librispeech_asr_dummy docstrings (albertvillanova, Jun 14, 2024)
8a6b329  Merge remote-tracking branch 'upstream/main' into set-datasets-trust-… (albertvillanova, Jun 14, 2024)
d0af733  Revert "Pin datasets<2.20.0 for examples" (albertvillanova, Jun 14, 2024)
e5877bc  Pass trust_remote_code to all examples (albertvillanova, Jun 14, 2024)
c8e1a96  Revert "Add trust_remote_dataset_code arg to some tests" to research_… (albertvillanova, Jun 14, 2024)
c5f4ef2  Pass trust_remote_code to tests (albertvillanova, Jun 14, 2024)
cb319f9  Pass trust_remote_code to docstrings (albertvillanova, Jun 14, 2024)
4c11530  Fix flax examples tests requirements (albertvillanova, Jun 14, 2024)
1f9c256  Pass trust_remote_dataset_code arg to tests (albertvillanova, Jun 14, 2024)
8380ae5  Replace trust_remote_dataset_code with trust_remote_code in one example (albertvillanova, Jun 14, 2024)
428ccd3  Fix duplicate trust_remote_code (albertvillanova, Jun 14, 2024)
a56b035  Replace args.trust_remote_dataset_code with args.trust_remote_code (albertvillanova, Jun 16, 2024)
08c18ac  Replace trust_remote_dataset_code with trust_remote_code in parser (albertvillanova, Jun 17, 2024)
0c2209d  Replace trust_remote_dataset_code with trust_remote_code in dataclasses (albertvillanova, Jun 17, 2024)
d92e71b  Replace trust_remote_dataset_code with trust_remote_code arg (albertvillanova, Jun 17, 2024)
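For context: this PR prepares transformers for datasets 2.20.0, where `trust_remote_code` defaults to `False` and loading a Hub dataset that ships a Python loading script fails unless the caller opts in explicitly (which is also why the `<2.20.0` pins are reverted below). A minimal sketch of the call pattern the diff standardizes on, using the dummy dataset already referenced throughout:

from datasets import load_dataset

# With datasets>=2.20.0, script-based datasets require an explicit opt-in;
# without it, load_dataset raises instead of silently executing Hub code.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,  # only for repositories whose code you have reviewed
)
print(ds[0]["audio"]["sampling_rate"])

Note that the diff below is the 5-commit snapshot: the later commits (a56b035 through d92e71b) rename the interim `trust_remote_dataset_code` argument to plain `trust_remote_code`.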
@@ -136,6 +136,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
text_column: Optional[str] = field(
default=None,
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
@@ -442,6 +452,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if training_args.do_eval:
@@ -452,6 +463,7 @@ def main():
cache_dir=data_args.dataset_cache_dir,
num_proc=data_args.preprocessing_num_workers,
token=True if model_args.use_auth_token else None,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if not training_args.do_train and not training_args.do_eval:
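The example scripts parse `DataTrainingArguments` with `HfArgumentParser`, so the boolean field above surfaces directly as a `--trust_remote_dataset_code` flag. A self-contained sketch of that mechanism (not part of the diff):

from dataclasses import dataclass, field

from transformers import HfArgumentParser

@dataclass
class DataTrainingArguments:
    trust_remote_dataset_code: bool = field(
        default=False,
        metadata={"help": "Whether to trust dataset loading scripts from the Hub."},
    )

# HfArgumentParser maps boolean dataclass fields to optional CLI flags,
# so passing the bare flag flips the default False to True.
(data_args,) = HfArgumentParser(DataTrainingArguments).parse_args_into_dataclasses(
    ["--trust_remote_dataset_code"]
)
assert data_args.trust_remote_dataset_code is True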
1 change: 1 addition & 0 deletions examples/flax/test_flax_examples.py
@@ -265,6 +265,7 @@ def test_run_flax_speech_recognition_seq2seq(self):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2
@@ -71,6 +71,15 @@ def parse_args():
required=True,
help="The names of the training data set splits to use (via the datasets library).",
)
parser.add_argument(
"--trust_remote_dataset_code",
action="store_true",
help=(
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
),
)
parser.add_argument(
"--preprocessing_num_workers",
type=int,
@@ -446,6 +455,7 @@ def main():
dataset_config_name,
split=train_split_name,
cache_dir=args.cache_dir,
trust_remote_code=args.trust_remote_dataset_code,
)
datasets_splits.append(dataset_split)

12 changes: 12 additions & 0 deletions examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -261,6 +261,16 @@ class DataTrainingArguments:
)
},
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
@@ -454,6 +464,7 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -479,6 +490,7 @@ def main():
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.max_eval_samples is not None:
@@ -251,6 +251,16 @@ class DataTrainingArguments:
)
},
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
@@ -434,6 +444,7 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,6 +470,7 @@ def main():
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=data_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.max_eval_samples is not None:
@@ -143,6 +143,16 @@ class DataTrainingArguments:
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
@@ -347,6 +357,7 @@ def main():
split=data_args.train_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if training_args.do_eval:
@@ -356,6 +367,7 @@ def main():
split=data_args.eval_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
trust_remote_code=data_args.trust_remote_dataset_code,
)

if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
4 changes: 4 additions & 0 deletions examples/pytorch/test_pytorch_examples.py
@@ -424,6 +424,7 @@ def test_run_speech_recognition_ctc(self):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--do_train
--do_eval
--learning_rate 1e-4
@@ -454,6 +455,7 @@ def test_run_speech_recognition_ctc_adapter(self):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--do_train
--do_eval
--learning_rate 1e-4
@@ -486,6 +488,7 @@ def test_run_speech_recognition_seq2seq(self):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_dataset_code
--do_train
--do_eval
--learning_rate 1e-4
@@ -547,6 +550,7 @@ def test_run_wav2vec2_pretraining(self):
--dataset_name hf-internal-testing/librispeech_asr_dummy
--dataset_config_names clean
--dataset_split_names validation
--trust_remote_dataset_code
--learning_rate 1e-4
--per_device_train_batch_size 4
--per_device_eval_batch_size 4
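The test snippets above are fragments of the argv strings each test builds and splits before calling the example script's `main()`. A sketch of the pattern used in `test_pytorch_examples.py` (simplified; the real tests add output dirs, epoch counts, and assertions on the results):

import sys
from unittest.mock import patch

testargs = """
    run_speech_recognition_ctc.py
    --dataset_name hf-internal-testing/librispeech_asr_dummy
    --dataset_config_name clean
    --trust_remote_dataset_code
    """.split()

# Patching sys.argv drives the script's argument parser in-process,
# avoiding a subprocess per test.
with patch.object(sys, "argv", testargs):
    pass  # the real test imports the example module and calls its main() here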
20 changes: 18 additions & 2 deletions examples/research_projects/wav2vec2/run_asr.py
@@ -104,6 +104,16 @@ class DataTrainingArguments:
)
},
)
trust_remote_dataset_code: bool = field(
default=False,
metadata={
"help": (
"Whether to trust the execution of code from the dataset defined on the Hub that uses a loading script."
" This option should only be set to `True` for repositories you trust and in which you have read the"
" code, as it will execute code present on the Hub on your local machine."
)
},
)
target_text_column: Optional[str] = field(
default="text",
metadata={"help": "Column in the dataset that contains label (target text). Defaults to 'text'"},
@@ -355,6 +365,16 @@ def main():
)

train_dataset = datasets.load_dataset(
-data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
+data_args.dataset_name,
+data_args.dataset_config_name,
+split=data_args.train_split_name,
+trust_remote_code=data_args.trust_remote_dataset_code,
)
val_dataset = datasets.load_dataset(
-data_args.dataset_name, data_args.dataset_config_name, split=data_args.validation_split_name
+data_args.dataset_name,
+data_args.dataset_config_name,
+split=data_args.validation_split_name,
+trust_remote_code=data_args.trust_remote_dataset_code,
)

wer_metric = datasets.load_metric("wer")
@@ -157,6 +157,7 @@ def run_trainer(
--dataset_config_name clean
--train_split_name validation
--validation_split_name validation
--trust_remote_dataset_code
--output_dir {output_dir}
--num_train_epochs {str(num_train_epochs)}
--per_device_train_batch_size 2
2 changes: 1 addition & 1 deletion setup.py
@@ -102,7 +102,7 @@
"codecarbon==1.2.0",
"cookiecutter==1.7.3",
"dataclasses",
"datasets!=2.5.0,<2.20.0", # Temporary upper version
"datasets!=2.5.0",
"decord==0.6.0",
"deepspeed>=0.9.3",
"diffusers",
4 changes: 3 additions & 1 deletion src/transformers/commands/pt_to_tf.py
@@ -202,7 +202,9 @@ def get_inputs(self, pt_model, tf_dummy_inputs, config):
"""

def _get_audio_input():
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
raw_samples = [x["array"] for x in speech_samples]
return raw_samples
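Note the asymmetry: internal tooling like `pt_to_tf` hardcodes `trust_remote_code=True` because it only ever loads `hf-internal-testing/librispeech_asr_dummy`, a repository maintained by the Hugging Face team, while the user-facing example scripts expose the decision as an opt-in flag.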
2 changes: 1 addition & 1 deletion src/transformers/dependency_versions_table.py
@@ -9,7 +9,7 @@
"codecarbon": "codecarbon==1.2.0",
"cookiecutter": "cookiecutter==1.7.3",
"dataclasses": "dataclasses",
"datasets": "datasets!=2.5.0,<2.20.0",
"datasets": "datasets!=2.5.0",
"decord": "decord==0.6.0",
"deepspeed": "deepspeed>=0.9.3",
"diffusers": "diffusers",
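The same change lands in both files because `setup.py` holds the canonical dependency list and `src/transformers/dependency_versions_table.py` is generated from it (via `make deps_table_update`); editing only one of the two would trip the repository's consistency checks.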
@@ -153,7 +153,9 @@ def test_double_precision_pad(self):
def _load_datasamples(self, num_samples):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

4 changes: 3 additions & 1 deletion tests/models/clap/test_feature_extraction_clap.py
@@ -164,7 +164,9 @@ def test_double_precision_pad(self):

# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

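The `# Copied from` marker above explains the fan-out of identical edits: `utils/check_copies.py` enforces that blocks marked as copies stay identical to their source, so the `trust_remote_code` change to the Whisper test's `_load_datasamples` has to be replicated verbatim in every copy.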
16 changes: 12 additions & 4 deletions tests/models/clap/test_modeling_clap.py
@@ -665,7 +665,9 @@ def test_integration_unfused(self):
"repeat": 0.0023,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_sample = librispeech_dummy[-1]

model_id = "laion/clap-htsat-unfused"
@@ -692,7 +694,9 @@ def test_integration_fused(self):
"pad": -0.000379,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_sample = librispeech_dummy[-1]

model_id = "laion/clap-htsat-fused"
@@ -719,7 +723,9 @@ def test_batched_fused(self):
"pad": 0.0006,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]

model_id = "laion/clap-htsat-fused"
@@ -746,7 +752,9 @@ def test_batched_unfused(self):
"pad": 0.0019,
}

-librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+librispeech_dummy = load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]

model_id = "laion/clap-htsat-unfused"
4 changes: 3 additions & 1 deletion tests/models/clvp/test_feature_extraction_clvp.py
@@ -209,7 +209,9 @@ def test_double_precision_pad(self):
self.assertTrue(pt_processed.input_features.dtype == torch.float32)

def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
ds = ds.cast_column("audio", Audio(sampling_rate=22050))
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
8 changes: 6 additions & 2 deletions tests/models/clvp/test_modeling_clvp.py
@@ -371,7 +371,9 @@ def get_config(self):
def prepare_config_and_inputs(self):
_, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()

-ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ds = datasets.load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()

@@ -553,7 +555,9 @@ def test_model_from_pretrained(self):
class ClvpIntegrationTest(unittest.TestCase):
def setUp(self):
self.text = "This is an example text."
-ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ds = datasets.load_dataset(
+"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()

4 changes: 3 additions & 1 deletion tests/models/data2vec/test_modeling_data2vec_audio.py
@@ -697,7 +697,9 @@ def test_compute_mask_indices_short_audio(self):
@slow
class Data2VecAudioModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
speech_samples = ds.sort("id").filter(
lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
4 changes: 3 additions & 1 deletion tests/models/encodec/test_feature_extraction_encodec.py
@@ -138,7 +138,9 @@ def test_double_precision_pad(self):
def _load_datasamples(self, num_samples):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
)
# automatic decoding with librispeech
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
