diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index d33be2789761..b3593d40558e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -526,7 +526,15 @@ def forward(self, x): return self.ln2(x + h + self.bias) def get_regression_trainer( - a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, keep_report_to=False, **kwargs + a=0, + b=0, + double_output=False, + train_len=64, + eval_len=64, + pretrained=True, + keep_report_to=False, + output_dir=None, + **kwargs, ): label_names = kwargs.get("label_names", None) gradient_checkpointing = kwargs.get("gradient_checkpointing", False) @@ -552,9 +560,8 @@ def get_regression_trainer( compute_metrics = kwargs.pop("compute_metrics", None) data_collator = kwargs.pop("data_collator", None) optimizers = kwargs.pop("optimizers", (None, None)) - output_dir = kwargs.pop("output_dir", "./regression") preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None) - + assert output_dir is not None, "output_dir should be specified for testing" args = RegressionTrainingArguments(output_dir, a=a, b=b, keep_report_to=keep_report_to, **kwargs) return Trainer( model, @@ -678,13 +685,15 @@ def setUp(self): args = TrainingArguments("..") self.n_epochs = args.num_train_epochs self.batch_size = args.train_batch_size - trainer = get_regression_trainer(learning_rate=0.1) - trainer.train() - self.default_trained_model = (trainer.model.a, trainer.model.b) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir) + trainer.train() + self.default_trained_model = (trainer.model.a, trainer.model.b) - trainer = get_regression_trainer(learning_rate=0.1, seed=314) - trainer.train() - self.alternate_trained_model = (trainer.model.a, trainer.model.b) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir) + trainer.train() + self.alternate_trained_model = (trainer.model.a, trainer.model.b) def check_trained_model(self, model, alternate_seed=False): # Checks a training seeded with learning_rate = 0.1 @@ -694,14 +703,16 @@ def check_trained_model(self, model, alternate_seed=False): def test_reproducible_training(self): # Checks that training worked, model trained and seed made a reproducible training. - trainer = get_regression_trainer(learning_rate=0.1) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir) + trainer.train() + self.check_trained_model(trainer.model) # Checks that a different seed gets different (reproducible) results. - trainer = get_regression_trainer(learning_rate=0.1, seed=314) - trainer.train() - self.check_trained_model(trainer.model, alternate_seed=True) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir) + trainer.train() + self.check_trained_model(trainer.model, alternate_seed=True) def test_trainer_with_datasets(self): import datasets @@ -713,41 +724,43 @@ def test_trainer_with_datasets(self): # Base training. 
Should have the same results as test_reproducible_training model = RegressionModel() - args = TrainingArguments("./regression", learning_rate=0.1, report_to="none") - trainer = Trainer(model, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, learning_rate=0.1, report_to="none") + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) - # Can return tensors. - train_dataset.set_format(type="torch", dtype=torch.float32) - model = RegressionModel() - trainer = Trainer(model, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) + # Can return tensors. + train_dataset.set_format(type="torch", dtype=torch.float32) + model = RegressionModel() + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) - # Adding one column not used by the model should have no impact - z = np.random.normal(size=(64,)).astype(np.float32) - train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z}) - model = RegressionModel() - trainer = Trainer(model, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) + # Adding one column not used by the model should have no impact + z = np.random.normal(size=(64,)).astype(np.float32) + train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z}) + model = RegressionModel() + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) def test_model_init(self): train_dataset = RegressionDataset() - args = TrainingArguments("./regression", learning_rate=0.1, report_to="none") - trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel()) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, learning_rate=0.1, report_to="none") + trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel()) + trainer.train() + self.check_trained_model(trainer.model) - # Re-training should restart from scratch, thus lead the same results. - trainer.train() - self.check_trained_model(trainer.model) + # Re-training should restart from scratch, thus lead the same results. + trainer.train() + self.check_trained_model(trainer.model) - # Re-training should restart from scratch, thus lead the same results and new seed should be used. - trainer.args.seed = 314 - trainer.train() - self.check_trained_model(trainer.model, alternate_seed=True) + # Re-training should restart from scratch, thus lead the same results and new seed should be used. 
+ trainer.args.seed = 314 + trainer.train() + self.check_trained_model(trainer.model, alternate_seed=True) @slow def test_gradient_accumulation_loss_alignment_with_model_loss(self): @@ -782,63 +795,67 @@ def tokenize_function(examples): "disable_tqdm": True, } - args = TrainingArguments( - "./generation", - **args_kwargs, - ) - trainer = Trainer( - model, - args, - train_dataset=tokenized_dataset["train"], - callbacks=[base_loss_callback], - data_collator=data_collator, - ) - assert trainer.model_accepts_loss_kwargs - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + **args_kwargs, + ) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[base_loss_callback], + data_collator=data_collator, + ) + assert trainer.model_accepts_loss_kwargs + trainer.train() grad_accum_loss_callback = StoreLossCallback() - args = TrainingArguments( - "./generation", - **args_kwargs, - gradient_accumulation_steps=2, - per_device_train_batch_size=4, - ) - set_seed(42) - model = AutoModelForCausalLM.from_pretrained(model_name) - trainer = Trainer( - model, - args, - train_dataset=tokenized_dataset["train"], - callbacks=[grad_accum_loss_callback], - data_collator=data_collator, - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + **args_kwargs, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + ) + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[grad_accum_loss_callback], + data_collator=data_collator, + ) + trainer.train() - set_seed(42) - model = AutoModelForCausalLM.from_pretrained(model_name) - broken_loss_callback = StoreLossCallback() - trainer = Trainer( - model, - args, - train_dataset=tokenized_dataset["train"], - callbacks=[broken_loss_callback], - data_collator=data_collator, - ) - # disable model_accepts_loss_kwargs - trainer.model_accepts_loss_kwargs = False - trainer.train() + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + broken_loss_callback = StoreLossCallback() + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[broken_loss_callback], + data_collator=data_collator, + ) + # disable model_accepts_loss_kwargs + trainer.model_accepts_loss_kwargs = False + trainer.train() - # Calculate the difference between the base loss and the grad_accum loss - diff_truth = [ - abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) - ] - diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + # Calculate the difference between the base loss and the grad_accum loss + diff_truth = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) + ] + diff_broken = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses) + ] - # all diff truth should be quite close - self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") + # all diff truth should be quite close + self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") - # max diff broken should be very off - self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") + # max diff broken should be very off + 
self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") @slow def test_gradient_accumulation_loss_alignment_with_loss_func(self): @@ -879,125 +896,135 @@ def compute_loss(logits, labels, vocab_size, num_items_in_batch, disable_num_ite "disable_tqdm": True, } - args = TrainingArguments( - "./generation", - **args_kwargs, - ) - trainer = Trainer( - model, - args, - train_dataset=tokenized_dataset["train"], - callbacks=[base_loss_callback], - compute_loss_func=loss_fn, - data_collator=data_collator, - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + **args_kwargs, + ) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[base_loss_callback], + compute_loss_func=loss_fn, + data_collator=data_collator, + ) + trainer.train() grad_accum_loss_callback = StoreLossCallback() - args = TrainingArguments( - "./generation", - **args_kwargs, - gradient_accumulation_steps=2, - per_device_train_batch_size=4, - ) - set_seed(42) - model = AutoModelForCausalLM.from_pretrained(model_name) - trainer = Trainer( - model, - args, - train_dataset=tokenized_dataset["train"], - callbacks=[grad_accum_loss_callback], - compute_loss_func=loss_fn, - data_collator=data_collator, - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + **args_kwargs, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + ) + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[grad_accum_loss_callback], + compute_loss_func=loss_fn, + data_collator=data_collator, + ) + trainer.train() - set_seed(42) - model = AutoModelForCausalLM.from_pretrained(model_name) - broken_loss_callback = StoreLossCallback() - loss_fn = partial(compute_loss, vocab_size=model.config.vocab_size, disable_num_items_in_batch=True) - trainer = Trainer( - model, - args, - train_dataset=tokenized_dataset["train"], - callbacks=[broken_loss_callback], - compute_loss_func=loss_fn, - data_collator=data_collator, - ) - trainer.train() + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + broken_loss_callback = StoreLossCallback() + loss_fn = partial(compute_loss, vocab_size=model.config.vocab_size, disable_num_items_in_batch=True) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[broken_loss_callback], + compute_loss_func=loss_fn, + data_collator=data_collator, + ) + trainer.train() - # Calculate the difference between the base loss and the grad_accum loss - diff_truth = [ - abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) - ] - diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + # Calculate the difference between the base loss and the grad_accum loss + diff_truth = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) + ] + diff_broken = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses) + ] - # all diff truth should be quite close - self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") + # all diff truth should be quite close + self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") - # max diff broken should be very 
off - self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") + # max diff broken should be very off + self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") def test_gradient_accumulation(self): # Training with half the batch size but accumulation steps as 2 should give the same training losses. - trainer = get_regression_trainer( - gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1 - ) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1, output_dir=tmp_dir + ) + trainer.train() + self.check_trained_model(trainer.model) def test_gradient_checkpointing(self): - trainer = get_regression_trainer( - per_device_train_batch_size=1, - learning_rate=0.1, - gradient_checkpointing=True, - gradient_checkpointing_kwargs={"use_reentrant": False}, - ) - previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()} + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + per_device_train_batch_size=1, + learning_rate=0.1, + gradient_checkpointing=True, + gradient_checkpointing_kwargs={"use_reentrant": False}, + output_dir=tmp_dir, + ) + previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()} - trainer.train() + trainer.train() - # Check if model weights have been updated - for k, v in trainer.model.named_parameters(): - self.assertFalse( - torch.allclose(previous_params[k], v, rtol=1e-4, atol=1e-4), - f"Model weights for {k} have not been updated", - ) + # Check if model weights have been updated + for k, v in trainer.model.named_parameters(): + self.assertFalse( + torch.allclose(previous_params[k], v, rtol=1e-4, atol=1e-4), + f"Model weights for {k} have not been updated", + ) def test_training_loss(self): n_gpus = max(1, backend_device_count(torch_device)) # With even logs - trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus)) - trainer.train() - log_history = trainer.state.log_history + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus), output_dir=tmp_dir) + trainer.train() + log_history = trainer.state.log_history - losses = [log["loss"] for log in log_history if "loss" in log] - train_loss = log_history[-1]["train_loss"] - self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4) + losses = [log["loss"] for log in log_history if "loss" in log] + train_loss = log_history[-1]["train_loss"] + self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4) # With uneven logs - trainer = get_regression_trainer(logging_steps=5) - trainer.train() - log_history = trainer.state.log_history + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(logging_steps=5, output_dir=tmp_dir) + trainer.train() + log_history = trainer.state.log_history - # Training loss should be the same as before - new_train_loss = log_history[-1]["train_loss"] - self.assertAlmostEqual(train_loss, new_train_loss, places=4) + # Training loss should be the same as before + new_train_loss = log_history[-1]["train_loss"] + self.assertAlmostEqual(train_loss, new_train_loss, places=4) def test_custom_optimizer(self): train_dataset = RegressionDataset() - args = TrainingArguments("./regression", report_to="none") - model = RegressionModel() - optimizer 
= torch.optim.SGD(model.parameters(), lr=1.0) - lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) - trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none") + model = RegressionModel() + optimizer = torch.optim.SGD(model.parameters(), lr=1.0) + lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) + trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) + trainer.train() - (a, b) = self.default_trained_model - self.assertFalse(torch.allclose(trainer.model.a, a)) - self.assertFalse(torch.allclose(trainer.model.b, b)) - self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) + (a, b) = self.default_trained_model + self.assertFalse(torch.allclose(trainer.model.a, a)) + self.assertFalse(torch.allclose(trainer.model.b, b)) + self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) def test_lr_scheduler_kwargs(self): # test scheduler kwargs passed via TrainingArguments @@ -1005,74 +1032,81 @@ def test_lr_scheduler_kwargs(self): model = RegressionModel() num_steps, num_warmup_steps = 10, 2 extra_kwargs = {"power": 5.0, "lr_end": 1e-5} # Non-default arguments - args = TrainingArguments( - "./regression", - lr_scheduler_type="polynomial", - lr_scheduler_kwargs=extra_kwargs, - learning_rate=0.2, - warmup_steps=num_warmup_steps, - report_to="none", - ) - trainer = Trainer(model, args, train_dataset=train_dataset) - trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + lr_scheduler_type="polynomial", + lr_scheduler_kwargs=extra_kwargs, + learning_rate=0.2, + warmup_steps=num_warmup_steps, + report_to="none", + ) + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) - # Checking that the scheduler was created - self.assertIsNotNone(trainer.lr_scheduler) + # Checking that the scheduler was created + self.assertIsNotNone(trainer.lr_scheduler) - # Checking that the correct args were passed - sched1 = trainer.lr_scheduler - sched2 = get_polynomial_decay_schedule_with_warmup( - trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs - ) - self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args) - self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords) + # Checking that the correct args were passed + sched1 = trainer.lr_scheduler + sched2 = get_polynomial_decay_schedule_with_warmup( + trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs + ) + self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args) + self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords) def test_cosine_with_min_lr_scheduler(self): train_dataset = RegressionDataset() model = RegressionModel() num_steps, num_warmup_steps = 10, 2 extra_kwargs = {"min_lr": 1e-5} # Non-default arguments - args = TrainingArguments( - "./regression", - lr_scheduler_type="cosine_with_min_lr", - lr_scheduler_kwargs=extra_kwargs, - learning_rate=0.2, - warmup_steps=num_warmup_steps, - report_to="none", - ) - trainer = Trainer(model, args, train_dataset=train_dataset) - 
trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + lr_scheduler_type="cosine_with_min_lr", + lr_scheduler_kwargs=extra_kwargs, + learning_rate=0.2, + warmup_steps=num_warmup_steps, + report_to="none", + ) + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) - # Checking that the scheduler was created - self.assertIsNotNone(trainer.lr_scheduler) + # Checking that the scheduler was created + self.assertIsNotNone(trainer.lr_scheduler) - # Check the last learning rate - for _ in range(num_steps): - trainer.lr_scheduler.step() - self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) + # Check the last learning rate + for _ in range(num_steps): + trainer.lr_scheduler.step() + self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) def test_reduce_lr_on_plateau_args(self): # test passed arguments for a custom ReduceLROnPlateau scheduler train_dataset = RegressionDataset(length=64) eval_dataset = RegressionDataset(length=64) - args = TrainingArguments( - "./regression", - eval_strategy="epoch", - metric_for_best_model="eval_loss", - report_to="none", - ) - model = RegressionModel() - optimizer = torch.optim.SGD(model.parameters(), lr=1.0) - lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2) - trainer = Trainer( - model, args, train_dataset=train_dataset, eval_dataset=eval_dataset, optimizers=(optimizer, lr_scheduler) - ) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + eval_strategy="epoch", + metric_for_best_model="eval_loss", + report_to="none", + ) + model = RegressionModel() + optimizer = torch.optim.SGD(model.parameters(), lr=1.0) + lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2) + trainer = Trainer( + model, + args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + optimizers=(optimizer, lr_scheduler), + ) + trainer.train() - self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) - self.assertEqual(trainer.lr_scheduler.factor, 0.2) - self.assertEqual(trainer.lr_scheduler.patience, 5) - self.assertEqual(trainer.lr_scheduler.cooldown, 2) + self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) + self.assertEqual(trainer.lr_scheduler.factor, 0.2) + self.assertEqual(trainer.lr_scheduler.patience, 5) + self.assertEqual(trainer.lr_scheduler.cooldown, 2) def test_reduce_lr_on_plateau(self): # test the ReduceLROnPlateau scheduler @@ -1087,39 +1121,40 @@ def log(self, logs): train_dataset = RegressionDataset(length=64) eval_dataset = RegressionDataset(length=64) - args = TrainingArguments( - "./regression", - lr_scheduler_type="reduce_lr_on_plateau", - eval_strategy="epoch", - metric_for_best_model="eval_loss", - num_train_epochs=10, - learning_rate=0.2, - report_to="none", - ) - model = RegressionModel() - trainer = TrainerWithLRLogs(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + lr_scheduler_type="reduce_lr_on_plateau", + eval_strategy="epoch", + metric_for_best_model="eval_loss", + num_train_epochs=10, + learning_rate=0.2, + report_to="none", + ) + model = RegressionModel() + trainer = TrainerWithLRLogs(model, args, 
train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() - self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) - patience = trainer.lr_scheduler.patience - - logs = trainer.state.log_history[1:] - best_loss = logs[0]["eval_loss"] - bad_epochs = 0 - for i, log in enumerate(logs[:-1]): # Compare learning rate to next epoch's - loss = log["eval_loss"] - just_decreased = False - if loss > best_loss: - bad_epochs += 1 - if bad_epochs > patience: - self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"]) - just_decreased = True + self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) + patience = trainer.lr_scheduler.patience + + logs = trainer.state.log_history[1:] + best_loss = logs[0]["eval_loss"] + bad_epochs = 0 + for i, log in enumerate(logs[:-1]): # Compare learning rate to next epoch's + loss = log["eval_loss"] + just_decreased = False + if loss > best_loss: + bad_epochs += 1 + if bad_epochs > patience: + self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"]) + just_decreased = True + bad_epochs = 0 + else: + best_loss = loss bad_epochs = 0 - else: - best_loss = loss - bad_epochs = 0 - if not just_decreased: - self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"]) + if not just_decreased: + self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"]) def test_adafactor_lr_none(self): # test the special case where lr=None, since Trainer can't not have lr_scheduler @@ -1127,29 +1162,36 @@ def test_adafactor_lr_none(self): from transformers.optimization import Adafactor, AdafactorSchedule train_dataset = RegressionDataset() - args = TrainingArguments("./regression", report_to="none") - model = RegressionModel() - optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) - lr_scheduler = AdafactorSchedule(optimizer) - trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) - trainer.train() + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none") + model = RegressionModel() + optimizer = Adafactor( + model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None + ) + lr_scheduler = AdafactorSchedule(optimizer) + trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) + trainer.train() - (a, b) = self.default_trained_model - self.assertFalse(torch.allclose(trainer.model.a, a)) - self.assertFalse(torch.allclose(trainer.model.b, b)) - self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0) + (a, b) = self.default_trained_model + self.assertFalse(torch.allclose(trainer.model.a, a)) + self.assertFalse(torch.allclose(trainer.model.b, b)) + self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0) @require_torch_accelerator @require_torch_bf16 def test_mixed_bf16(self): # very basic test - trainer = get_regression_trainer(learning_rate=0.1, bf16=True) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir) + trainer.train() + self.check_trained_model(trainer.model) # --bf16 --half_precision_backend apex can't be used together - with self.assertRaises(ValueError): - trainer = get_regression_trainer(learning_rate=0.1, bf16=True, half_precision_backend="apex") + with 
tempfile.TemporaryDirectory() as tmp_dir: + with self.assertRaises(ValueError): + trainer = get_regression_trainer( + learning_rate=0.1, bf16=True, half_precision_backend="apex", output_dir=tmp_dir + ) # will add more specific tests once there are some bugs to fix @@ -1158,9 +1200,10 @@ def test_mixed_bf16(self): @require_torch_tf32 def test_tf32(self): # very basic test - trainer = get_regression_trainer(learning_rate=0.1, tf32=True) - trainer.train() - self.check_trained_model(trainer.model) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, tf32=True, output_dir=tmp_dir) + trainer.train() + self.check_trained_model(trainer.model) @require_torch @@ -1179,75 +1222,87 @@ def test_trainer_works_with_dict(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionDictModel() - args = TrainingArguments("./regression", report_to="none") - trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none") + trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) def test_evaluation_with_keys_to_drop(self): config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) x = torch.randint(0, 100, (128,)) eval_dataset = RepeatDataset(x) - args = TrainingArguments("./test", report_to="none") - trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset) - # By default the past_key_values are removed - result = trainer.predict(eval_dataset) - self.assertTrue(isinstance(result.predictions, np.ndarray)) - # We can still get them by setting ignore_keys to [] - result = trainer.predict(eval_dataset, ignore_keys=[]) - self.assertTrue(isinstance(result.predictions, tuple)) - self.assertEqual(len(result.predictions), 2) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none") + trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset) + # By default the past_key_values are removed + result = trainer.predict(eval_dataset) + self.assertTrue(isinstance(result.predictions, np.ndarray)) + # We can still get them by setting ignore_keys to [] + result = trainer.predict(eval_dataset, ignore_keys=[]) + self.assertTrue(isinstance(result.predictions, tuple)) + self.assertEqual(len(result.predictions), 2) def test_training_arguments_are_left_untouched(self): - trainer = get_regression_trainer() - trainer.train() - args = TrainingArguments("./regression", report_to=[]) - dict1, dict2 = args.to_dict(), trainer.args.to_dict() - for key in dict1.keys(): - # Logging dir can be slightly different as they default to something with the time. - if key != "logging_dir": - self.assertEqual(dict1[key], dict2[key]) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(output_dir=tmp_dir) + trainer.train() + args = TrainingArguments(tmp_dir, report_to=[]) + dict1, dict2 = args.to_dict(), trainer.args.to_dict() + for key in dict1.keys(): + # Logging dir can be slightly different as they default to something with the time. 
+ if key != "logging_dir": + self.assertEqual(dict1[key], dict2[key]) def test_number_of_steps_in_training(self): # Regular training has n_epochs * len(train_dl) steps - trainer = get_regression_trainer(learning_rate=0.1) - train_output = trainer.train() - self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir) + train_output = trainer.train() + self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) - # Check passing num_train_epochs works (and a float version too): - trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) + # Check passing num_train_epochs works (and a float version too): + trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, output_dir=tmp_dir) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) - # If we pass a max_steps, num_train_epochs is ignored - trainer = get_regression_trainer(learning_rate=0.1, max_steps=10) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 10) + # If we pass a max_steps, num_train_epochs is ignored + trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, output_dir=tmp_dir) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 10) @require_torch_bf16 @require_intel_extension_for_pytorch def test_number_of_steps_in_training_with_ipex(self): for mix_bf16 in [True, False]: - # Regular training has n_epochs * len(train_dl) steps - trainer = get_regression_trainer(learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True) - train_output = trainer.train() - self.assertEqual(train_output.global_step, self.n_epochs * 64 / trainer.args.train_batch_size) + with tempfile.TemporaryDirectory() as tmp_dir: + # Regular training has n_epochs * len(train_dl) steps + trainer = get_regression_trainer( + learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, self.n_epochs * 64 / trainer.args.train_batch_size) - # Check passing num_train_epochs works (and a float version too): - trainer = get_regression_trainer( - learning_rate=0.1, num_train_epochs=1.5, use_ipex=True, bf16=mix_bf16, use_cpu=True - ) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(1.5 * 64 / trainer.args.train_batch_size)) + # Check passing num_train_epochs works (and a float version too): + trainer = get_regression_trainer( + learning_rate=0.1, + num_train_epochs=1.5, + use_ipex=True, + bf16=mix_bf16, + use_cpu=True, + output_dir=tmp_dir, + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(1.5 * 64 / trainer.args.train_batch_size)) - # If we pass a max_steps, num_train_epochs is ignored - trainer = get_regression_trainer( - learning_rate=0.1, max_steps=10, use_ipex=True, bf16=mix_bf16, use_cpu=True - ) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 10) + # If we pass a max_steps, num_train_epochs is ignored + trainer = get_regression_trainer( + learning_rate=0.1, max_steps=10, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 10) def 
test_torch_compile_loss_func_compatibility(self): config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) @@ -1450,52 +1505,54 @@ def test_neftune(self): train_dataset = RepeatDataset(x) # Trainer without inf/nan filter - args = TrainingArguments( - "./test", - learning_rate=1e-9, - logging_steps=5, - logging_nan_inf_filter=False, - neftune_noise_alpha=0.4, - report_to="none", - ) - trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + learning_rate=1e-9, + logging_steps=5, + logging_nan_inf_filter=False, + neftune_noise_alpha=0.4, + report_to="none", + ) + trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) - trainer.model = trainer._activate_neftune(trainer.model) + trainer.model = trainer._activate_neftune(trainer.model) - dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) + dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device) - emb1 = trainer.model.get_input_embeddings()(dummy_input) - emb2 = trainer.model.get_input_embeddings()(dummy_input) + emb1 = trainer.model.get_input_embeddings()(dummy_input) + emb2 = trainer.model.get_input_embeddings()(dummy_input) - self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!") + self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!") # redefine the model tiny_gpt2 = GPT2LMHeadModel(config) # Trainer without inf/nan filter - args = TrainingArguments( - "./test", - learning_rate=1e-9, - logging_steps=5, - logging_nan_inf_filter=False, - neftune_noise_alpha=0.4, - report_to="none", - ) - trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + learning_rate=1e-9, + logging_steps=5, + logging_nan_inf_filter=False, + neftune_noise_alpha=0.4, + report_to="none", + ) + trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) - # Check that it trains without errors - trainer.train() + # Check that it trains without errors + trainer.train() - # Make sure forward pass works fine - _ = trainer.model(dummy_input) - self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0) + # Make sure forward pass works fine + _ = trainer.model(dummy_input) + self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0) - trainer.model.eval() + trainer.model.eval() - # Check that we get identical embeddings just in case - emb1 = trainer.model.get_input_embeddings()(dummy_input) - emb2 = trainer.model.get_input_embeddings()(dummy_input) + # Check that we get identical embeddings just in case + emb1 = trainer.model.get_input_embeddings()(dummy_input) + emb2 = trainer.model.get_input_embeddings()(dummy_input) - self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!") + self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!") def test_logging_inf_nan_filter(self): config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) @@ -1504,59 +1561,69 @@ def test_logging_inf_nan_filter(self): train_dataset = RepeatDataset(x) # Trainer without inf/nan filter - args = TrainingArguments( - "./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none" - ) - trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) - trainer.train() - log_history_no_filter = trainer.state.log_history + with tempfile.TemporaryDirectory() as tmp_dir: + args 
= TrainingArguments( + tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none" + ) + trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) + trainer.train() + log_history_no_filter = trainer.state.log_history # Trainer with inf/nan filter - args = TrainingArguments( - "./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none" - ) - trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) - trainer.train() - log_history_filter = trainer.state.log_history + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none" + ) + trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) + trainer.train() + log_history_filter = trainer.state.log_history - def is_any_loss_nan_or_inf(log_history): - losses = [l["loss"] for l in log_history[:-1]] - return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses) + def is_any_loss_nan_or_inf(log_history): + losses = [l["loss"] for l in log_history[:-1]] + return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses) - self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter)) - self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) + self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter)) + self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) def test_train_and_eval_dataloaders(self): if torch_device == "cuda": n_gpu = max(1, backend_device_count(torch_device)) else: n_gpu = 1 - trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16) - self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu) - trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16) - self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu) - - # Check drop_last works - trainer = get_regression_trainer( - train_len=66, eval_len=74, learning_rate=0.1, per_device_train_batch_size=16, per_device_eval_batch_size=32 - ) - self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu) + 1) - self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu) + 1) - trainer = get_regression_trainer( - train_len=66, - eval_len=74, - learning_rate=0.1, - per_device_train_batch_size=16, - per_device_eval_batch_size=32, - dataloader_drop_last=True, - ) - self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu)) - self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu)) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir) + self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu) + trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir) + self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu) + + # Check drop_last works + trainer = get_regression_trainer( + train_len=66, + eval_len=74, + learning_rate=0.1, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + output_dir=tmp_dir, + ) + self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu) + 1) + self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu) + 1) + + trainer = get_regression_trainer( + train_len=66, + eval_len=74, + learning_rate=0.1, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + 
dataloader_drop_last=True, + output_dir=tmp_dir, + ) + self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu)) + self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu)) - # Check passing a new dataset for evaluation works - new_eval_dataset = RegressionDataset(length=128) - self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu)) + # Check passing a new dataset for evaluation works + new_eval_dataset = RegressionDataset(length=128) + self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu)) # tests that we do not require dataloader to have a .dataset attribute def test_dataloader_without_dataset(self): @@ -1576,92 +1643,94 @@ def test_get_eval_dataloader_without_persistent_workers(self): train_dataset = RegressionDataset() config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) - args = TrainingArguments("./test", report_to="none", dataloader_persistent_workers=False) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none", dataloader_persistent_workers=False) - # Single evaluation dataset - eval_dataset = RegressionDataset() - trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x - - default_dataloader = trainer.get_eval_dataloader() - dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) - - self.assertEqual(default_dataloader.dataset, eval_dataset) - self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) - self.assertNotEqual(default_dataloader, dataloader_with_dataset) - - # Multiple evaluation datasets - first_dataset = RegressionDataset() - second_dataset = RegressionDataset() - trainer = Trainer( - tiny_gpt2, - args, - train_dataset=train_dataset, - eval_dataset={"first": first_dataset, "second": second_dataset}, - ) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x + # Single evaluation dataset + eval_dataset = RegressionDataset() + trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x + + default_dataloader = trainer.get_eval_dataloader() + dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) + + self.assertEqual(default_dataloader.dataset, eval_dataset) + self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) + self.assertNotEqual(default_dataloader, dataloader_with_dataset) - first_dataloader = trainer.get_eval_dataloader("first") - first_dataloader_repeated = trainer.get_eval_dataloader("first") - second_dataloader = trainer.get_eval_dataloader("second") - second_dataloader_repeated = trainer.get_eval_dataloader("second") + # Multiple evaluation datasets + first_dataset = RegressionDataset() + second_dataset = RegressionDataset() + trainer = Trainer( + tiny_gpt2, + args, + train_dataset=train_dataset, + eval_dataset={"first": first_dataset, "second": second_dataset}, + ) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x + + first_dataloader = 
trainer.get_eval_dataloader("first") + first_dataloader_repeated = trainer.get_eval_dataloader("first") + second_dataloader = trainer.get_eval_dataloader("second") + second_dataloader_repeated = trainer.get_eval_dataloader("second") - self.assertEqual(first_dataset, first_dataloader.dataset) - self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) - self.assertEqual(second_dataset, second_dataloader.dataset) - self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) - self.assertNotEqual(first_dataloader, first_dataloader_repeated) - self.assertNotEqual(second_dataloader, second_dataloader_repeated) + self.assertEqual(first_dataset, first_dataloader.dataset) + self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) + self.assertEqual(second_dataset, second_dataloader.dataset) + self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) + self.assertNotEqual(first_dataloader, first_dataloader_repeated) + self.assertNotEqual(second_dataloader, second_dataloader_repeated) def test_get_eval_dataloader_with_persistent_workers(self): train_dataset = RegressionDataset() config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) - args = TrainingArguments( - "./test", - report_to="none", - dataloader_persistent_workers=True, - dataloader_num_workers=2, - ) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + report_to="none", + dataloader_persistent_workers=True, + dataloader_num_workers=2, + ) - # Single evaluation dataset - eval_dataset = RegressionDataset() - trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x - - default_dataloader = trainer.get_eval_dataloader() - dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) - - self.assertEqual(default_dataloader.dataset, eval_dataset) - self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) - self.assertEqual(default_dataloader, dataloader_with_dataset) - - # Multiple evaluation datasets - first_dataset = RegressionDataset() - second_dataset = RegressionDataset() - trainer = Trainer( - tiny_gpt2, - args, - train_dataset=train_dataset, - eval_dataset={"first": first_dataset, "second": second_dataset}, - ) - # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader - trainer.accelerator.prepare = lambda x: x + # Single evaluation dataset + eval_dataset = RegressionDataset() + trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x + + default_dataloader = trainer.get_eval_dataloader() + dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset) + + self.assertEqual(default_dataloader.dataset, eval_dataset) + self.assertEqual(dataloader_with_dataset.dataset, eval_dataset) + self.assertEqual(default_dataloader, dataloader_with_dataset) + + # Multiple evaluation datasets + first_dataset = RegressionDataset() + second_dataset = RegressionDataset() + trainer = Trainer( + tiny_gpt2, + args, + train_dataset=train_dataset, + eval_dataset={"first": first_dataset, "second": second_dataset}, + ) + # Mocking the prepare method to avoid the 
dataloader changing with each call to get_eval_dataloader + trainer.accelerator.prepare = lambda x: x - first_dataloader = trainer.get_eval_dataloader("first") - first_dataloader_repeated = trainer.get_eval_dataloader("first") - second_dataloader = trainer.get_eval_dataloader("second") - second_dataloader_repeated = trainer.get_eval_dataloader("second") + first_dataloader = trainer.get_eval_dataloader("first") + first_dataloader_repeated = trainer.get_eval_dataloader("first") + second_dataloader = trainer.get_eval_dataloader("second") + second_dataloader_repeated = trainer.get_eval_dataloader("second") - self.assertEqual(first_dataset, first_dataloader.dataset) - self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) - self.assertEqual(second_dataset, second_dataloader.dataset) - self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) - self.assertEqual(first_dataloader, first_dataloader_repeated) - self.assertEqual(second_dataloader, second_dataloader_repeated) + self.assertEqual(first_dataset, first_dataloader.dataset) + self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset) + self.assertEqual(second_dataset, second_dataloader.dataset) + self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset) + self.assertEqual(first_dataloader, first_dataloader_repeated) + self.assertEqual(second_dataloader, second_dataloader_repeated) @require_liger_kernel def test_use_liger_kernel_patching(self): @@ -1678,15 +1747,16 @@ def test_use_liger_kernel_patching(self): self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb) self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm)) - args = TrainingArguments( - "./test", - use_liger_kernel=True, - ) - Trainer(tiny_llama, args) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + use_liger_kernel=True, + ) + Trainer(tiny_llama, args) - # Spot check that modeling code and model instance variables are patched - self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb) - self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm)) + # Spot check that modeling code and model instance variables are patched + self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb) + self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm)) @require_liger_kernel @require_torch_gpu @@ -2162,148 +2232,67 @@ def test_data_is_not_parallelized_when_model_is_parallel(self): # Make the Trainer believe it's a parallelized model model.is_parallelizable = True model.model_parallel = True - args = TrainingArguments( - "./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16, report_to="none" - ) - trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset()) - # Check the Trainer was fooled - self.assertTrue(trainer.is_model_parallel) - self.assertEqual(trainer.args.n_gpu, 1) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, per_device_train_batch_size=16, per_device_eval_batch_size=16, report_to="none" + ) + trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset()) + # Check the Trainer was fooled + self.assertTrue(trainer.is_model_parallel) + self.assertEqual(trainer.args.n_gpu, 1) - # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu - self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) - 
self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16) - self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) - self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16) + # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu + self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) + self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16) + self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) + self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16) def test_evaluate(self): - trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With logits preprocess - trainer = get_regression_trainer( - a=1.5, - b=2.5, - compute_metrics=AlmostAccuracy(), - preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - ) - results = trainer.evaluate() + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir) + results = trainer.evaluate() - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - def test_evaluate_with_batch_eval_metrics(self): - trainer = get_regression_trainer( - a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer( - a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - 
self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With logits preprocess - trainer = get_regression_trainer( - a=1.5, - b=2.5, - compute_metrics=AlmostAccuracyBatched(), - batch_eval_metrics=True, - preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - ) - results = trainer.evaluate() + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir + ) + results = trainer.evaluate() - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - def test_evaluate_with_jit(self): - trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), jit_mode_eval=True) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer( - a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), jit_mode_eval=True - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With logits preprocess - trainer = get_regression_trainer( - a=1.5, - b=2.5, - compute_metrics=AlmostAccuracy(), - preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - jit_mode_eval=True, - ) - results = trainer.evaluate() + # With logits preprocess + trainer = get_regression_trainer( + a=1.5, + b=2.5, + compute_metrics=AlmostAccuracy(), + preprocess_logits_for_metrics=lambda logits, labels: logits + 1, + output_dir=tmp_dir, + ) + results = trainer.evaluate() - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - @require_torch_bf16 - @require_intel_extension_for_pytorch - def test_evaluate_with_ipex(self): - for mix_bf16 in [True, False]: + def test_evaluate_with_batch_eval_metrics(self): + with tempfile.TemporaryDirectory() as tmp_dir: trainer = 
get_regression_trainer( - a=1.5, b=2.5, use_ipex=True, compute_metrics=AlmostAccuracy(), bf16=mix_bf16, use_cpu=True + a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir ) results = trainer.evaluate() @@ -2318,11 +2307,10 @@ def test_evaluate_with_ipex(self): trainer = get_regression_trainer( a=1.5, b=2.5, - use_ipex=True, eval_len=66, - compute_metrics=AlmostAccuracy(), - bf16=mix_bf16, - use_cpu=True, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + output_dir=tmp_dir, ) results = trainer.evaluate() @@ -2337,11 +2325,10 @@ def test_evaluate_with_ipex(self): trainer = get_regression_trainer( a=1.5, b=2.5, - use_ipex=True, - compute_metrics=AlmostAccuracy(), + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - bf16=mix_bf16, - use_cpu=True, + output_dir=tmp_dir, ) results = trainer.evaluate() @@ -2352,143 +2339,228 @@ def test_evaluate_with_ipex(self): expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - def test_predict(self): - trainer = get_regression_trainer(a=1.5, b=2.5) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With more than one output of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - - # With more than one output/label of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"]) - outputs = trainer.predict(trainer.eval_dataset) - preds = outputs.predictions - labels = outputs.label_ids - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) - self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + def test_evaluate_with_jit(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), jit_mode_eval=True, output_dir=tmp_dir + ) + results = trainer.evaluate() - def test_predict_with_batch_eval_metrics(self): - trainer = get_regression_trainer( - a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = trainer.predict(trainer.eval_dataset) - preds = results.predictions - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - gt = 1.5 * x + 2.5 - self.assertTrue(np.allclose(preds, gt)) - expected_acc = AlmostAccuracy()((preds, y))["accuracy"] - self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer( - a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - results = 
trainer.predict(trainer.eval_dataset) - preds = results.predictions - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - expected_acc = AlmostAccuracy()((preds, y))["accuracy"] - self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) - - # With more than one output of the model - trainer = get_regression_trainer( - a=1.5, b=2.5, double_output=True, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True - ) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - - # With more than one output/label of the model - trainer = get_regression_trainer( - a=1.5, - b=2.5, - double_output=True, - label_names=["labels", "labels_2"], - compute_metrics=AlmostAccuracyBatched(), - batch_eval_metrics=True, - ) - outputs = trainer.predict(trainer.eval_dataset) - preds = outputs.predictions - labels = outputs.label_ids - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) - self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - def test_predict_with_jit(self): - trainer = get_regression_trainer(a=1.5, b=2.5, jit_mode_eval=True) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, jit_mode_eval=True) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With more than one output of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, jit_mode_eval=True) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - - # With more than one output/label of the model - trainer = get_regression_trainer( - a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], jit_mode_eval=True - ) - outputs = trainer.predict(trainer.eval_dataset) - preds = outputs.predictions - labels = outputs.label_ids - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) - self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), jit_mode_eval=True, output_dir=tmp_dir + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x 
+ 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With logits preprocess + trainer = get_regression_trainer( + a=1.5, + b=2.5, + compute_metrics=AlmostAccuracy(), + preprocess_logits_for_metrics=lambda logits, labels: logits + 1, + jit_mode_eval=True, + output_dir=tmp_dir, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) @require_torch_bf16 @require_intel_extension_for_pytorch - def test_predict_with_ipex(self): + def test_evaluate_with_ipex(self): for mix_bf16 in [True, False]: - trainer = get_regression_trainer(a=1.5, b=2.5, use_ipex=True, bf16=mix_bf16, use_cpu=True) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + use_ipex=True, + compute_metrics=AlmostAccuracy(), + bf16=mix_bf16, + use_cpu=True, + output_dir=tmp_dir, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, + b=2.5, + use_ipex=True, + eval_len=66, + compute_metrics=AlmostAccuracy(), + bf16=mix_bf16, + use_cpu=True, + output_dir=tmp_dir, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With logits preprocess + trainer = get_regression_trainer( + a=1.5, + b=2.5, + use_ipex=True, + compute_metrics=AlmostAccuracy(), + preprocess_logits_for_metrics=lambda logits, labels: logits + 1, + bf16=mix_bf16, + use_cpu=True, + output_dir=tmp_dir, + ) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + def test_predict(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(a=1.5, b=2.5, output_dir=tmp_dir) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, output_dir=tmp_dir) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With more than one output of the model + trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, output_dir=tmp_dir) + preds 
= trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + + # With more than one output/label of the model + trainer = get_regression_trainer( + a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], output_dir=tmp_dir + ) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + + def test_predict_with_batch_eval_metrics(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir + ) + results = trainer.predict(trainer.eval_dataset) + preds = results.predictions + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + gt = 1.5 * x + 2.5 + self.assertTrue(np.allclose(preds, gt)) + expected_acc = AlmostAccuracy()((preds, y))["accuracy"] + self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, + b=2.5, + eval_len=66, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + output_dir=tmp_dir, + ) + results = trainer.predict(trainer.eval_dataset) + preds = results.predictions + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + expected_acc = AlmostAccuracy()((preds, y))["accuracy"] + self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) + + # With more than one output of the model + trainer = get_regression_trainer( + a=1.5, + b=2.5, + double_output=True, + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + output_dir=tmp_dir, + ) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + + # With more than one output/label of the model + trainer = get_regression_trainer( + a=1.5, + b=2.5, + double_output=True, + label_names=["labels", "labels_2"], + compute_metrics=AlmostAccuracyBatched(), + batch_eval_metrics=True, + output_dir=tmp_dir, + ) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + + def test_predict_with_jit(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(a=1.5, b=2.5, jit_mode_eval=True, output_dir=tmp_dir) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, 
eval_len=66, use_ipex=True, bf16=mix_bf16, use_cpu=True) + trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, jit_mode_eval=True, output_dir=tmp_dir) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) # With more than one output of the model - trainer = get_regression_trainer( - a=1.5, b=2.5, double_output=True, use_ipex=True, bf16=mix_bf16, use_cpu=True - ) + trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, jit_mode_eval=True, output_dir=tmp_dir) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertEqual(len(preds), 2) @@ -2501,9 +2573,8 @@ def test_predict_with_ipex(self): b=2.5, double_output=True, label_names=["labels", "labels_2"], - use_ipex=True, - bf16=mix_bf16, - use_cpu=True, + jit_mode_eval=True, + output_dir=tmp_dir, ) outputs = trainer.predict(trainer.eval_dataset) preds = outputs.predictions @@ -2515,41 +2586,94 @@ def test_predict_with_ipex(self): self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + @require_torch_bf16 + @require_intel_extension_for_pytorch + def test_predict_with_ipex(self): + for mix_bf16 in [True, False]: + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + a=1.5, b=2.5, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir + ) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer( + a=1.5, b=2.5, eval_len=66, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir + ) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With more than one output of the model + trainer = get_regression_trainer( + a=1.5, b=2.5, double_output=True, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir + ) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + + # With more than one output/label of the model + trainer = get_regression_trainer( + a=1.5, + b=2.5, + double_output=True, + label_names=["labels", "labels_2"], + use_ipex=True, + bf16=mix_bf16, + use_cpu=True, + output_dir=tmp_dir, + ) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + def test_dynamic_shapes(self): eval_dataset = DynamicShapesDataset(batch_size=self.batch_size) model = RegressionModel(a=2, b=1) - args = TrainingArguments("./regression", report_to="none") - trainer = Trainer(model, args, eval_dataset=eval_dataset) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none") + trainer = Trainer(model, args, eval_dataset=eval_dataset) - # Check evaluation can run to completion - _ = trainer.evaluate() + # Check evaluation can 
run to completion + _ = trainer.evaluate() - # Check predictions - preds = trainer.predict(eval_dataset) - for expected, seen in zip(eval_dataset.ys, preds.label_ids): - self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + # Check predictions + preds = trainer.predict(eval_dataset) + for expected, seen in zip(eval_dataset.ys, preds.label_ids): + self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - for expected, seen in zip(eval_dataset.xs, preds.predictions): - self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + for expected, seen in zip(eval_dataset.xs, preds.predictions): + self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) # Same tests with eval accumulation - args = TrainingArguments("./regression", eval_accumulation_steps=2, report_to="none") - trainer = Trainer(model, args, eval_dataset=eval_dataset) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, eval_accumulation_steps=2, report_to="none") + trainer = Trainer(model, args, eval_dataset=eval_dataset) - # Check evaluation can run to completion - _ = trainer.evaluate() + # Check evaluation can run to completion + _ = trainer.evaluate() - # Check predictions - preds = trainer.predict(eval_dataset) - for expected, seen in zip(eval_dataset.ys, preds.label_ids): - self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + # Check predictions + preds = trainer.predict(eval_dataset) + for expected, seen in zip(eval_dataset.ys, preds.label_ids): + self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - for expected, seen in zip(eval_dataset.xs, preds.predictions): - self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + for expected, seen in zip(eval_dataset.xs, preds.predictions): + self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) def test_log_level(self): # testing only --log_level (--log_level_replica requires multiple gpus and DDP and is tested elsewhere) @@ -2558,27 +2682,29 @@ def test_log_level(self): # test with the default log_level - should be the same as before and thus we test depending on is_info is_info = logging.get_verbosity() <= 20 - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer() - trainer.train() - if is_info: - self.assertIn(log_info_string, cl.out) - else: - self.assertNotIn(log_info_string, cl.out) - with LoggingLevel(logging.INFO): - # test with low log_level - lower than info + with tempfile.TemporaryDirectory() as tmp_dir: with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="debug") + trainer = get_regression_trainer(output_dir=tmp_dir) trainer.train() - self.assertIn(log_info_string, cl.out) + if is_info: + self.assertIn(log_info_string, cl.out) + else: + self.assertNotIn(log_info_string, cl.out) - with LoggingLevel(logging.INFO): - # test with high log_level - should be quiet - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="error") - trainer.train() - 
self.assertNotIn(log_info_string, cl.out) + with LoggingLevel(logging.INFO): + # test with low log_level - lower than info + with CaptureLogger(logger) as cl: + trainer = get_regression_trainer(log_level="debug", output_dir=tmp_dir) + trainer.train() + self.assertIn(log_info_string, cl.out) + + with LoggingLevel(logging.INFO): + # test with high log_level - should be quiet + with CaptureLogger(logger) as cl: + trainer = get_regression_trainer(log_level="error", output_dir=tmp_dir) + trainer.train() + self.assertNotIn(log_info_string, cl.out) def test_save_checkpoints(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -2668,12 +2794,13 @@ def test_run_seq2seq_double_train_wrap_once(self): # since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for # example DataParallel(DataParallel(model)) - trainer = get_regression_trainer() - trainer.train() - model_wrapped_before = trainer.model_wrapped - trainer.train() - model_wrapped_after = trainer.model_wrapped - self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice") + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(output_dir=tmp_dir) + trainer.train() + model_wrapped_before = trainer.model_wrapped + trainer.train() + model_wrapped_after = trainer.model_wrapped + self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice") @require_torch_up_to_2_accelerators def test_can_resume_training(self): @@ -2762,17 +2889,18 @@ def test_can_resume_training(self): # Now check failures # 1. fail to find a bogus checkpoint - trainer = get_regression_trainer() - with self.assertRaises(Exception) as context: - trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") - self.assertTrue("Can't find a valid checkpoint at" in str(context.exception)) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir) + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") + self.assertTrue("Can't find a valid checkpoint at" in str(context.exception)) # 2. fail to find any checkpoint - due a fresh output_dir - output_dir2 = self.get_auto_remove_tmp_dir() - trainer = get_regression_trainer(output_dir=output_dir2) - with self.assertRaises(Exception) as context: - trainer.train(resume_from_checkpoint=True) - self.assertTrue("No valid checkpoint found in output directory" in str(context.exception)) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir) + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=True) + self.assertTrue("No valid checkpoint found in output directory" in str(context.exception)) @unittest.skip( reason="@muellerzr: Fix once Trainer can take an accelerate configuration. Need to set `seedable_sampler=True`." 
@@ -3185,10 +3313,11 @@ def test_trainer_eval_mrpc(self): ) eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev") - training_args = TrainingArguments(output_dir="./examples", use_cpu=True, report_to="none") - trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset) - result = trainer.evaluate() - self.assertLess(result["eval_loss"], 0.2) + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = TrainingArguments(output_dir=tmp_dir, use_cpu=True, report_to="none") + trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset) + result = trainer.evaluate() + self.assertLess(result["eval_loss"], 0.2) @slow def test_trainer_eval_multiple(self): @@ -3202,23 +3331,24 @@ def test_trainer_eval_multiple(self): ) for example in dataset.examples: example["labels"] = example["input_ids"] - training_args = TrainingArguments( - output_dir="./examples", - use_cpu=True, - per_device_eval_batch_size=1, - report_to="none", - ) - trainer = Trainer( - model=model, - args=training_args, - eval_dataset={ - "data1": dataset, - "data2": dataset, - }, - ) - result = trainer.evaluate() - self.assertIn("eval_data1_loss", result) - self.assertIn("eval_data2_loss", result) + with tempfile.TemporaryDirectory() as tmp_dir: + training_args = TrainingArguments( + output_dir=tmp_dir, + use_cpu=True, + per_device_eval_batch_size=1, + report_to="none", + ) + trainer = Trainer( + model=model, + args=training_args, + eval_dataset={ + "data1": dataset, + "data2": dataset, + }, + ) + result = trainer.evaluate() + self.assertIn("eval_data1_loss", result) + self.assertIn("eval_data2_loss", result) @slow def test_trainer_eval_lm(self): @@ -3237,14 +3367,15 @@ def test_training_iterable_dataset(self): # Adding one column not used by the model should have no impact train_dataset = SampleIterableDataset(label_names=["labels", "extra"]) - args = RegressionTrainingArguments(output_dir="./examples", max_steps=4) - trainer = Trainer(model=model, args=args, train_dataset=train_dataset) - trainer.train() - self.assertEqual(trainer.state.global_step, 4) + with tempfile.TemporaryDirectory() as tmp_dir: + args = RegressionTrainingArguments(output_dir=tmp_dir, max_steps=4) + trainer = Trainer(model=model, args=args, train_dataset=train_dataset) + trainer.train() + self.assertEqual(trainer.state.global_step, 4) - loader = trainer.get_train_dataloader() - self.assertIsInstance(loader, torch.utils.data.DataLoader) - self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) + loader = trainer.get_train_dataloader() + self.assertIsInstance(loader, torch.utils.data.DataLoader) + self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) def test_evaluation_iterable_dataset(self): config = RegressionModelConfig(a=1.5, b=2.5) @@ -3252,61 +3383,70 @@ def test_evaluation_iterable_dataset(self): # Adding one column not used by the model should have no impact eval_dataset = SampleIterableDataset(label_names=["labels", "extra"]) - args = RegressionTrainingArguments(output_dir="./examples") - trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() + with tempfile.TemporaryDirectory() as tmp_dir: + args = RegressionTrainingArguments(output_dir=tmp_dir) + trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy()) + results = trainer.evaluate() - x, y = trainer.eval_dataset.dataset.x, 
trainer.eval_dataset.dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + x, y = trainer.eval_dataset.dataset.x, trainer.eval_dataset.dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - # With a number of elements not a round multiple of the batch size - eval_dataset = SampleIterableDataset(length=66) - results = trainer.evaluate(eval_dataset) + # With a number of elements not a round multiple of the batch size + eval_dataset = SampleIterableDataset(length=66) + results = trainer.evaluate(eval_dataset) - x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) def test_predict_iterable_dataset(self): config = RegressionModelConfig(a=1.5, b=2.5) model = RegressionPreTrainedModel(config) eval_dataset = SampleIterableDataset() - args = RegressionTrainingArguments(output_dir="./examples") - trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy()) + with tempfile.TemporaryDirectory() as tmp_dir: + args = RegressionTrainingArguments(output_dir=tmp_dir) + trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy()) - preds = trainer.predict(trainer.eval_dataset).predictions - x = eval_dataset.dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + preds = trainer.predict(trainer.eval_dataset).predictions + x = eval_dataset.dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - # With a number of elements not a round multiple of the batch size - # Adding one column not used by the model should have no impact - test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"]) - preds = trainer.predict(test_dataset).predictions - x = test_dataset.dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + # With a number of elements not a round multiple of the batch size + # Adding one column not used by the model should have no impact + test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"]) + preds = trainer.predict(test_dataset).predictions + x = test_dataset.dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) def test_num_train_epochs_in_training(self): # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given. # It should give 1 update step for each epoch. 
- trainer = get_regression_trainer( - max_steps=3, train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5 - ) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 3) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + max_steps=3, + train_len=64, + per_device_train_batch_size=16, + gradient_accumulation_steps=5, + output_dir=tmp_dir, + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 3) - # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if - # len(train_dl) < gradient_accumulation_steps. - trainer = get_regression_trainer(train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(self.n_epochs)) + # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if + # len(train_dl) < gradient_accumulation_steps. + trainer = get_regression_trainer( + train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5, output_dir=tmp_dir + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(self.n_epochs)) def test_early_stopping_callback(self): # early stopping stops training before num_training_epochs @@ -3345,22 +3485,23 @@ def test_early_stopping_callback(self): self.assertEqual(trainer.state.global_step, 0) def test_flos_extraction(self): - trainer = get_regression_trainer(learning_rate=0.1) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir) - def assert_flos_extraction(trainer, wrapped_model_to_check): - self.assertEqual(trainer.model, trainer.accelerator.unwrap_model(wrapped_model_to_check)) - self.assertGreaterEqual( - getattr(trainer.accelerator.unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0 - ) + def assert_flos_extraction(trainer, wrapped_model_to_check): + self.assertEqual(trainer.model, trainer.accelerator.unwrap_model(wrapped_model_to_check)) + self.assertGreaterEqual( + getattr(trainer.accelerator.unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0 + ) - # with plain model - assert_flos_extraction(trainer, trainer.model) + # with plain model + assert_flos_extraction(trainer, trainer.model) - # with enforced DataParallel - assert_flos_extraction(trainer, nn.DataParallel(trainer.model)) + # with enforced DataParallel + assert_flos_extraction(trainer, nn.DataParallel(trainer.model)) - trainer.train() - self.assertTrue(isinstance(trainer.state.total_flos, float)) + trainer.train() + self.assertTrue(isinstance(trainer.state.total_flos, float)) def check_checkpoint_deletion(self, trainer, output_dir, expected): # Make fake checkpoints @@ -3452,13 +3593,14 @@ def check_mem_metrics(self, trainer, check_func): check_func("test_mem_gpu_alloc_delta", metrics) def test_mem_metrics(self): - # with mem metrics enabled - trainer = get_regression_trainer(skip_memory_metrics=False) - self.check_mem_metrics(trainer, self.assertIn) + with tempfile.TemporaryDirectory() as tmp_dir: + # with mem metrics enabled + trainer = get_regression_trainer(skip_memory_metrics=False, output_dir=tmp_dir) + self.check_mem_metrics(trainer, self.assertIn) - # with mem metrics disabled - trainer = get_regression_trainer(skip_memory_metrics=True) - self.check_mem_metrics(trainer, self.assertNotIn) + # with mem metrics disabled + trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir) + 
self.check_mem_metrics(trainer, self.assertNotIn) @require_torch_accelerator def test_fp16_full_eval(self): @@ -3467,55 +3609,60 @@ def test_fp16_full_eval(self): debug = 0 n_gpus = backend_device_count(torch_device) - bs = 8 - eval_len = 16 * n_gpus - # make the params somewhat big so that there will be enough RAM consumed to be able to - # measure things. We should get about 64KB for a+b in fp32 - a = torch.ones(1000, bs) + 0.001 - b = torch.ones(1000, bs) - 0.001 - - # 1. with fp16_full_eval disabled - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False) - metrics = trainer.evaluate() - del trainer - gc.collect() - - fp32_init = metrics["init_mem_gpu_alloc_delta"] - fp32_eval = metrics["eval_mem_gpu_alloc_delta"] - - if debug: - print(f"fp32_init {fp32_init}") - print(f"fp32_eval {fp32_eval}") - - # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. - # perfect world: fp32_init == 64<<10 - self.assertGreater(fp32_init, 59_000) - # after eval should be no extra memory allocated - with a small margin (other than the peak - # memory consumption for the forward calculation that gets recovered) - # perfect world: fp32_eval == close to zero - self.assertLess(fp32_eval, 5_000) - - # 2. with fp16_full_eval enabled - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False) - metrics = trainer.evaluate() - fp16_init = metrics["init_mem_gpu_alloc_delta"] - fp16_eval = metrics["eval_mem_gpu_alloc_delta"] - - if debug: - print(f"fp16_init {fp16_init}") - print(f"fp16_eval {fp16_eval}") - - # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 - # perfect world: fp16_init == close to zero - self.assertLess(fp16_init, 5_000) - # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back) - # perfect world: fp32_init == 32<<10 - self.assertGreater(fp16_eval, 27_000) - - # 3. relative comparison fp32 vs full fp16 - # should be about half of fp16_init - # perfect world: fp32_init/2 == fp16_eval - self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000) + with tempfile.TemporaryDirectory() as tmp_dir: + bs = 8 + eval_len = 16 * n_gpus + # make the params somewhat big so that there will be enough RAM consumed to be able to + # measure things. We should get about 64KB for a+b in fp32 + a = torch.ones(1000, bs) + 0.001 + b = torch.ones(1000, bs) - 0.001 + + # 1. with fp16_full_eval disabled + trainer = get_regression_trainer( + a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir + ) + metrics = trainer.evaluate() + del trainer + gc.collect() + + fp32_init = metrics["init_mem_gpu_alloc_delta"] + fp32_eval = metrics["eval_mem_gpu_alloc_delta"] + + if debug: + print(f"fp32_init {fp32_init}") + print(f"fp32_eval {fp32_eval}") + + # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. + # perfect world: fp32_init == 64<<10 + self.assertGreater(fp32_init, 59_000) + # after eval should be no extra memory allocated - with a small margin (other than the peak + # memory consumption for the forward calculation that gets recovered) + # perfect world: fp32_eval == close to zero + self.assertLess(fp32_eval, 5_000) + + # 2. 
with fp16_full_eval enabled + trainer = get_regression_trainer( + a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False, output_dir=tmp_dir + ) + metrics = trainer.evaluate() + fp16_init = metrics["init_mem_gpu_alloc_delta"] + fp16_eval = metrics["eval_mem_gpu_alloc_delta"] + + if debug: + print(f"fp16_init {fp16_init}") + print(f"fp16_eval {fp16_eval}") + + # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 + # perfect world: fp16_init == close to zero + self.assertLess(fp16_init, 5_000) + # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back) + # perfect world: fp32_init == 32<<10 + self.assertGreater(fp16_eval, 27_000) + + # 3. relative comparison fp32 vs full fp16 + # should be about half of fp16_init + # perfect world: fp32_init/2 == fp16_eval + self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000) @require_non_xpu @require_torch_non_multi_gpu @@ -3534,30 +3681,31 @@ def test_torchdynamo_full_eval(self): a = torch.ones(1000, bs) + 0.001 b = torch.ones(1000, bs) - 0.001 - # 1. Default - without TorchDynamo - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len) - metrics = trainer.evaluate() - original_eval_loss = metrics["eval_loss"] - del trainer - - # 2. TorchDynamo eager - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="eager") - metrics = trainer.evaluate() - self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) - del trainer - torchdynamo.reset() - - # 3. TorchDynamo nvfuser - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="nvfuser") - metrics = trainer.evaluate() - self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) - torchdynamo.reset() - - # 4. TorchDynamo fx2trt - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="fx2trt") - metrics = trainer.evaluate() - self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) - torchdynamo.reset() + with tempfile.TemporaryDirectory() as tmp_dir: + # 1. Default - without TorchDynamo + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, output_dir=tmp_dir) + metrics = trainer.evaluate() + original_eval_loss = metrics["eval_loss"] + del trainer + + # 2. TorchDynamo eager + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="eager", output_dir=tmp_dir) + metrics = trainer.evaluate() + self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) + del trainer + torchdynamo.reset() + + # 3. TorchDynamo nvfuser + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="nvfuser", output_dir=tmp_dir) + metrics = trainer.evaluate() + self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) + torchdynamo.reset() + + # 4. TorchDynamo fx2trt + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="fx2trt", output_dir=tmp_dir) + metrics = trainer.evaluate() + self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) + torchdynamo.reset() @unittest.skip(reason="torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.") @require_torch_non_multi_gpu @@ -3606,30 +3754,31 @@ def forward(self, x): del trainer # 2. 
TorchDynamo nvfuser - a = torch.ones(1024, 1024, device="cuda", requires_grad=True) - a.grad = None - args = TrainingArguments(output_dir="None", torchdynamo="nvfuser") - trainer = CustomTrainer(model=mod, args=args) - # warmup - for _ in range(10): - loss = trainer.training_step(mod, {"x": a}) - - # resets - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + with tempfile.TemporaryDirectory() as tmp_dir: + a = torch.ones(1024, 1024, device="cuda", requires_grad=True) + a.grad = None + args = TrainingArguments(output_dir=tmp_dir, torchdynamo="nvfuser") + trainer = CustomTrainer(model=mod, args=args) + # warmup + for _ in range(10): + loss = trainer.training_step(mod, {"x": a}) + + # resets + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() - loss = trainer.training_step(mod, {"x": a}) - peak_mem = torch.cuda.max_memory_allocated() - torchdynamo.reset() - del trainer + loss = trainer.training_step(mod, {"x": a}) + peak_mem = torch.cuda.max_memory_allocated() + torchdynamo.reset() + del trainer - # Functional check - self.assertAlmostEqual(loss, orig_loss) + # Functional check + self.assertAlmostEqual(loss, orig_loss) - # AOT Autograd recomputaion and nvfuser recomputation optimization - # aggressively fuses the operations and reduce the memory footprint. - self.assertGreater(orig_peak_mem, peak_mem * 2) + # AOT Autograd recomputaion and nvfuser recomputation optimization + # aggressively fuses the operations and reduce the memory footprint. + self.assertGreater(orig_peak_mem, peak_mem * 2) @require_torch_accelerator @require_torch_bf16 @@ -3648,48 +3797,53 @@ def test_bf16_full_eval(self): a = torch.ones(1000, bs) + 0.001 b = torch.ones(1000, bs) - 0.001 - # 1. with bf16_full_eval disabled - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False) - metrics = trainer.evaluate() - del trainer - gc.collect() - - fp32_init = metrics["init_mem_gpu_alloc_delta"] - fp32_eval = metrics["eval_mem_gpu_alloc_delta"] - - if debug: - print(f"fp32_init {fp32_init}") - print(f"fp32_eval {fp32_eval}") - - # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. - # perfect world: fp32_init == 64<<10 - self.assertGreater(fp32_init, 59_000) - # after eval should be no extra memory allocated - with a small margin (other than the peak - # memory consumption for the forward calculation that gets recovered) - # perfect world: fp32_eval == close to zero - self.assertLess(fp32_eval, 5_000) - - # 2. with bf16_full_eval enabled - trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, bf16_full_eval=True, skip_memory_metrics=False) - metrics = trainer.evaluate() - bf16_init = metrics["init_mem_gpu_alloc_delta"] - bf16_eval = metrics["eval_mem_gpu_alloc_delta"] - - if debug: - print(f"bf16_init {bf16_init}") - print(f"bf16_eval {bf16_eval}") - - # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 - # perfect world: bf16_init == close to zero - self.assertLess(bf16_init, 5_000) - # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back) - # perfect world: fp32_init == 32<<10 - self.assertGreater(bf16_eval, 27_000) - - # 3. 
relative comparison fp32 vs full bf16 - # should be about half of bf16_init - # perfect world: fp32_init/2 == bf16_eval - self.assertAlmostEqual(bf16_eval, fp32_init / 2, delta=5_000) + with tempfile.TemporaryDirectory() as tmp_dir: + # 1. with bf16_full_eval disabled + trainer = get_regression_trainer( + a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir + ) + metrics = trainer.evaluate() + del trainer + gc.collect() + + fp32_init = metrics["init_mem_gpu_alloc_delta"] + fp32_eval = metrics["eval_mem_gpu_alloc_delta"] + + if debug: + print(f"fp32_init {fp32_init}") + print(f"fp32_eval {fp32_eval}") + + # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. + # perfect world: fp32_init == 64<<10 + self.assertGreater(fp32_init, 59_000) + # after eval should be no extra memory allocated - with a small margin (other than the peak + # memory consumption for the forward calculation that gets recovered) + # perfect world: fp32_eval == close to zero + self.assertLess(fp32_eval, 5_000) + + # 2. with bf16_full_eval enabled + trainer = get_regression_trainer( + a=a, b=b, eval_len=eval_len, bf16_full_eval=True, skip_memory_metrics=False, output_dir=tmp_dir + ) + metrics = trainer.evaluate() + bf16_init = metrics["init_mem_gpu_alloc_delta"] + bf16_eval = metrics["eval_mem_gpu_alloc_delta"] + + if debug: + print(f"bf16_init {bf16_init}") + print(f"bf16_eval {bf16_eval}") + + # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 + # perfect world: bf16_init == close to zero + self.assertLess(bf16_init, 5_000) + # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back) + # perfect world: fp32_init == 32<<10 + self.assertGreater(bf16_eval, 27_000) + + # 3. 
relative comparison fp32 vs full bf16 + # should be about half of bf16_init + # perfect world: fp32_init/2 == bf16_eval + self.assertAlmostEqual(bf16_eval, fp32_init / 2, delta=5_000) def test_no_wd_param_group(self): model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) @@ -3757,9 +3911,7 @@ def test_accelerator_config_empty(self): eval_dataset = SampleIterableDataset() # Leaves one option as something *not* basic - args = RegressionTrainingArguments( - output_dir=tmp_dir, - ) + args = RegressionTrainingArguments(output_dir=tmp_dir) trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset) self.assertEqual(trainer.accelerator.split_batches, False) self.assertEqual(trainer.accelerator.dispatch_batches, None) @@ -3788,10 +3940,7 @@ def test_accelerator_config_from_dict(self): accelerator_config["gradient_accumulation_kwargs"] = {"sync_each_batch": True} # Leaves all options as something *not* basic - args = RegressionTrainingArguments( - output_dir=tmp_dir, - accelerator_config=accelerator_config, - ) + args = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config=accelerator_config) trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset) self.assertEqual(trainer.accelerator.split_batches, True) self.assertEqual(trainer.accelerator.dispatch_batches, True) @@ -4052,11 +4201,12 @@ def test_eval_use_gather_object(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionDictModel() - args = TrainingArguments("./regression", report_to="none", eval_use_gather_object=True) - trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments(tmp_dir, report_to="none", eval_use_gather_object=True) + trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) def test_trainer_saves_tokenizer(self): MODEL_ID = "google-bert/bert-base-uncased" @@ -4683,22 +4833,22 @@ def hp_name(trial): optim_test_params = [ ( - TrainingArguments(optim=OptimizerNames.ADAMW_HF, output_dir="None"), + OptimizerNames.ADAMW_HF, transformers.optimization.AdamW, default_adam_kwargs, ), ( - TrainingArguments(optim=OptimizerNames.ADAMW_HF.value, output_dir="None"), + OptimizerNames.ADAMW_HF.value, transformers.optimization.AdamW, default_adam_kwargs, ), ( - TrainingArguments(optim=OptimizerNames.ADAMW_TORCH, output_dir="None"), + OptimizerNames.ADAMW_TORCH, torch.optim.AdamW, default_adam_kwargs, ), ( - TrainingArguments(optim=OptimizerNames.ADAFACTOR, output_dir="None"), + OptimizerNames.ADAFACTOR, transformers.optimization.Adafactor, { "scale_parameter": False, @@ -4713,7 +4863,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADAMW_APEX_FUSED, output_dir="None"), + OptimizerNames.ADAMW_APEX_FUSED, apex.optimizers.FusedAdam, default_adam_kwargs, ) @@ -4724,7 +4874,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADAMW_BNB, output_dir="None"), + OptimizerNames.ADAMW_BNB, bnb.optim.AdamW, default_adam_kwargs, ) @@ -4732,7 +4882,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADAMW_8BIT, output_dir="None"), + OptimizerNames.ADAMW_8BIT, bnb.optim.AdamW, default_adam_kwargs, ) @@ -4740,7 +4890,7 @@ def hp_name(trial): 
optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.PAGED_ADAMW, output_dir="None"), + OptimizerNames.PAGED_ADAMW, bnb.optim.AdamW, default_adam_kwargs, ) @@ -4748,7 +4898,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.PAGED_ADAMW_8BIT, output_dir="None"), + OptimizerNames.PAGED_ADAMW_8BIT, bnb.optim.AdamW, default_adam_kwargs, ) @@ -4756,7 +4906,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.LION, output_dir="None"), + OptimizerNames.LION, bnb.optim.Lion, default_lion_kwargs, ) @@ -4764,7 +4914,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.LION_8BIT, output_dir="None"), + OptimizerNames.LION_8BIT, bnb.optim.Lion, default_lion_kwargs, ) @@ -4772,7 +4922,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.PAGED_LION_8BIT, output_dir="None"), + OptimizerNames.PAGED_LION_8BIT, bnb.optim.Lion, default_lion_kwargs, ) @@ -4781,28 +4931,28 @@ def hp_name(trial): if version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.44.0"): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADEMAMIX, output_dir="None"), + OptimizerNames.ADEMAMIX, bnb.optim.AdEMAMix, default_ademamix_kwargs, ) ) optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADEMAMIX_8BIT, output_dir="None"), + OptimizerNames.ADEMAMIX_8BIT, bnb.optim.AdEMAMix, default_ademamix_kwargs, ) ) optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX_8BIT, output_dir="None"), + OptimizerNames.PAGED_ADEMAMIX_8BIT, bnb.optim.AdEMAMix, default_ademamix_kwargs, ) ) optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX, output_dir="None"), + OptimizerNames.PAGED_ADEMAMIX, bnb.optim.AdEMAMix, default_ademamix_kwargs, ) @@ -4813,7 +4963,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADAMW_ANYPRECISION, output_dir="None"), + OptimizerNames.ADAMW_ANYPRECISION, torchdistx.optimizers.AnyPrecisionAdamW, dict(default_adam_kwargs, **default_anyprecision_kwargs), ) @@ -4823,7 +4973,7 @@ def hp_name(trial): optim_test_params.append( ( - TrainingArguments(optim=OptimizerNames.ADAMW_TORCH_4BIT, output_dir="None"), + OptimizerNames.ADAMW_TORCH_4BIT, torchao.prototype.low_bit_optim.AdamW4bit, default_adam_kwargs, ) @@ -4843,12 +4993,13 @@ def check_optim_and_kwargs(self, training_args: TrainingArguments, expected_cls, self.assertTrue(actual_v == v, f"Failed check for {p}. Expected {v}, but got {actual_v}.") @parameterized.expand(optim_test_params, skip_on_empty=True) - def test_optim_supported(self, training_args: TrainingArguments, expected_cls, expected_kwargs): - # exercises all the valid --optim options - self.check_optim_and_kwargs(training_args, expected_cls, expected_kwargs) + def test_optim_supported(self, optim: str, expected_cls, expected_kwargs): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(output_dir=tmp_dir, optim=optim) - trainer = get_regression_trainer(**training_args.to_dict()) - trainer.train() + # exercises all the valid --optim options + self.check_optim_and_kwargs(trainer.args, expected_cls, expected_kwargs) + trainer.train() def test_fused_adam(self): # Pretend that apex is installed and mock apex.optimizers.FusedAdam exists. 
@@ -4861,21 +5012,23 @@ def test_fused_adam(self):
             "apex.optimizers": mock.optimizers,
             "apex.optimizers.FusedAdam": mock.optimizers.FusedAdam,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.ADAMW_APEX_FUSED, output_dir="None"),
-                mock.optimizers.FusedAdam,
-                default_adam_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.ADAMW_APEX_FUSED, output_dir=tmp_dir),
+                    mock.optimizers.FusedAdam,
+                    default_adam_kwargs,
+                )

     def test_fused_adam_no_apex(self):
-        args = TrainingArguments(optim=OptimizerNames.ADAMW_APEX_FUSED, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.ADAMW_APEX_FUSED, output_dir=tmp_dir)

-        # Pretend that apex does not exist, even if installed. By setting apex to None, importing
-        # apex will fail even if apex is installed.
-        with patch.dict("sys.modules", {"apex.optimizers": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that apex does not exist, even if installed. By setting apex to None, importing
+            # apex will fail even if apex is installed.
+            with patch.dict("sys.modules", {"apex.optimizers": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_adam8bit(self):
         # Pretend that Bits and Bytes is installed and mock bnb.optim.Adam8bit exists.
@@ -4888,12 +5041,13 @@ def test_bnb_adam8bit(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdamW": mock.optim.AdamW,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.ADAMW_BNB, output_dir="None"),
-                mock.optim.AdamW,
-                default_adam_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.ADAMW_BNB, output_dir=tmp_dir),
+                    mock.optim.AdamW,
+                    default_adam_kwargs,
+                )

     def test_bnb_paged_adam8bit_alias(self):
         mock = Mock()
@@ -4902,12 +5056,13 @@ def test_bnb_paged_adam8bit_alias(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdamW": mock.optim.AdamW,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.ADAMW_8BIT, output_dir="None"),
-                mock.optim.AdamW,
-                default_adam_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.ADAMW_8BIT, output_dir=tmp_dir),
+                    mock.optim.AdamW,
+                    default_adam_kwargs,
+                )

     def test_bnb_paged_adam(self):
         mock = Mock()
@@ -4916,12 +5071,13 @@ def test_bnb_paged_adam(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdamW": mock.optim.AdamW,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.PAGED_ADAMW, output_dir="None"),
-                mock.optim.AdamW,
-                default_adam_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.PAGED_ADAMW, output_dir=tmp_dir),
+                    mock.optim.AdamW,
+                    default_adam_kwargs,
+                )

     def test_bnb_paged_adam8bit(self):
         mock = Mock()
@@ -4930,12 +5086,13 @@ def test_bnb_paged_adam8bit(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdamW": mock.optim.AdamW,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.PAGED_ADAMW_8BIT, output_dir="None"),
-                mock.optim.AdamW,
-                default_adam_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.PAGED_ADAMW_8BIT, output_dir=tmp_dir),
+                    mock.optim.AdamW,
+                    default_adam_kwargs,
+                )

     def test_bnb_ademamix(self):
         mock = Mock()
@@ -4944,12 +5101,13 @@ def test_bnb_ademamix(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdEMAMix": mock.optim.AdEMAMix,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.ADEMAMIX, output_dir="None"),
-                mock.optim.AdEMAMix,
-                default_ademamix_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.ADEMAMIX, output_dir=tmp_dir),
+                    mock.optim.AdEMAMix,
+                    default_ademamix_kwargs,
+                )

     def test_bnb_ademamix8bit(self):
         mock = Mock()
@@ -4958,12 +5116,13 @@ def test_bnb_ademamix8bit(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdEMAMix": mock.optim.AdEMAMix,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.ADEMAMIX_8BIT, output_dir="None"),
-                mock.optim.AdEMAMix,
-                default_ademamix_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.ADEMAMIX_8BIT, output_dir=tmp_dir),
+                    mock.optim.AdEMAMix,
+                    default_ademamix_kwargs,
+                )

     def test_bnb_paged_ademamix(self):
         mock = Mock()
@@ -4972,12 +5131,13 @@ def test_bnb_paged_ademamix(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdEMAMix": mock.optim.AdEMAMix,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX, output_dir="None"),
-                mock.optim.AdEMAMix,
-                default_ademamix_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX, output_dir=tmp_dir),
+                    mock.optim.AdEMAMix,
+                    default_ademamix_kwargs,
+                )

     def test_bnb_paged_ademamix8bit(self):
         mock = Mock()
@@ -4986,12 +5146,13 @@ def test_bnb_paged_ademamix8bit(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.AdEMAMix": mock.optim.AdEMAMix,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX_8BIT, output_dir="None"),
-                mock.optim.AdEMAMix,
-                default_ademamix_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX_8BIT, output_dir=tmp_dir),
+                    mock.optim.AdEMAMix,
+                    default_ademamix_kwargs,
+                )

     def test_bnb_lion(self):
         mock = Mock()
@@ -5000,12 +5161,13 @@ def test_bnb_lion(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.Lion": mock.optim.Lion,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.LION, output_dir="None"),
-                mock.optim.Lion,
-                default_lion_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.LION, output_dir=tmp_dir),
+                    mock.optim.Lion,
+                    default_lion_kwargs,
+                )

     def test_bnb_lion8bit(self):
         mock = Mock()
@@ -5014,12 +5176,13 @@ def test_bnb_lion8bit(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.Lion": mock.optim.Lion,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.LION_8BIT, output_dir="None"),
-                mock.optim.Lion,
-                default_lion_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.LION_8BIT, output_dir=tmp_dir),
+                    mock.optim.Lion,
+                    default_lion_kwargs,
+                )

     def test_bnb_paged_lion8bit(self):
         mock = Mock()
@@ -5028,12 +5191,13 @@ def test_bnb_paged_lion8bit(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.Lion": mock.optim.Lion,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.PAGED_LION_8BIT, output_dir="None"),
-                mock.optim.Lion,
-                default_lion_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.PAGED_LION_8BIT, output_dir=tmp_dir),
+                    mock.optim.Lion,
+                    default_lion_kwargs,
+                )

     def test_bnb_paged_lion(self):
         mock = Mock()
@@ -5042,93 +5206,103 @@ def test_bnb_paged_lion(self):
             "bitsandbytes.optim": mock.optim,
             "bitsandbytes.optim.Lion": mock.optim.Lion,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.PAGED_LION, output_dir="None"),
-                mock.optim.Lion,
-                default_lion_kwargs,
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.PAGED_LION, output_dir=tmp_dir),
+                    mock.optim.Lion,
+                    default_lion_kwargs,
+                )

     def test_bnb_adam8bit_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.ADAMW_BNB, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.ADAMW_BNB, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_paged_adam_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.PAGED_ADAMW, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.PAGED_ADAMW, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_paged_adam8bit_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.PAGED_ADAMW_8BIT, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.PAGED_ADAMW_8BIT, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_ademamix_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.ADEMAMIX, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.ADEMAMIX, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_ademamix8bit_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.ADEMAMIX_8BIT, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.ADEMAMIX_8BIT, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_paged_ademamix_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_paged_ademamix8bit_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX_8BIT, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.PAGED_ADEMAMIX_8BIT, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_paged_lion_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.PAGED_LION, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.PAGED_LION, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_bnb_paged_lion8bit_no_bnb(self):
-        args = TrainingArguments(optim=OptimizerNames.PAGED_LION_8BIT, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.PAGED_LION_8BIT, output_dir=tmp_dir)

-        # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
-        # bnb will fail even if `bitsandbytes` is installed.
-        with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing
+            # bnb will fail even if `bitsandbytes` is installed.
+            with patch.dict("sys.modules", {"bitsandbytes.optim": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)

     def test_anyprecision_adamw(self):
         # Pretend that torchdistx is installed and mock torchdistx.optimizers.AnyPrecisionAdamW exists.
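The no_apex and *_no_bnb tests above all rely on the same standard-library behaviour: mapping a module name to None in sys.modules makes any subsequent import of that name raise ImportError, so the optimizer lookup fails exactly as it would on a machine without the package. A minimal sketch of that mechanism in isolation (the module name used here is arbitrary and purely illustrative):

from unittest.mock import patch

# While the patch is active, importing the module behaves as if it were not installed.
with patch.dict("sys.modules", {"statistics": None}):
    try:
        import statistics  # noqa: F401
    except ImportError:
        pass  # this branch is taken: a None entry in sys.modules halts the import

# Outside the context manager the original sys.modules entry is restored.
import statistics  # noqa: F401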
@@ -5141,21 +5315,23 @@ def test_anyprecision_adamw(self):
             "torchdistx.optimizers": mock.optimizers,
             "torchdistx.optimizers.AnyPrecisionAdamW.": mock.optimizers.AnyPrecisionAdamW,
         }
-        with patch.dict("sys.modules", modules):
-            self.check_optim_and_kwargs(
-                TrainingArguments(optim=OptimizerNames.ADAMW_ANYPRECISION, output_dir="None"),
-                mock.optimizers.AnyPrecisionAdamW,
-                dict(default_adam_kwargs, **default_anyprecision_kwargs),
-            )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with patch.dict("sys.modules", modules):
+                self.check_optim_and_kwargs(
+                    TrainingArguments(optim=OptimizerNames.ADAMW_ANYPRECISION, output_dir=tmp_dir),
+                    mock.optimizers.AnyPrecisionAdamW,
+                    dict(default_adam_kwargs, **default_anyprecision_kwargs),
+                )

     def test_no_torchdistx_anyprecision_adamw(self):
-        args = TrainingArguments(optim=OptimizerNames.ADAMW_ANYPRECISION, output_dir="None")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            args = TrainingArguments(optim=OptimizerNames.ADAMW_ANYPRECISION, output_dir=tmp_dir)

-        # Pretend that torchdistx does not exist, even if installed. By setting torchdistx to None, importing
-        # torchdistx.optimizers will fail even if torchdistx is installed.
-        with patch.dict("sys.modules", {"torchdistx.optimizers": None}):
-            with self.assertRaises(ValueError):
-                Trainer.get_optimizer_cls_and_kwargs(args)
+            # Pretend that torchdistx does not exist, even if installed. By setting torchdistx to None, importing
+            # torchdistx.optimizers will fail even if torchdistx is installed.
+            with patch.dict("sys.modules", {"torchdistx.optimizers": None}):
+                with self.assertRaises(ValueError):
+                    Trainer.get_optimizer_cls_and_kwargs(args)


 @require_torch