From 9ee591e0bac21be8d1227be90c9b4ab8e3be3d61 Mon Sep 17 00:00:00 2001 From: Teven Date: Tue, 28 Jul 2020 15:31:04 +0200 Subject: [PATCH 01/26] neFLOs calculation, logging, and reloading (#1) --- src/transformers/modeling_utils.py | 24 +++++++++++++++++++++--- src/transformers/trainer.py | 24 +++++++++++++++++++++--- src/transformers/trainer_utils.py | 23 +++++++++++++++++++++++ 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bd33f7a7a357..4b7986ec9b21 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -92,20 +92,38 @@ class ModuleUtilsMixin: A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin. """ - def num_parameters(self, only_trainable: bool = False) -> int: + def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = False) -> int: """ - Get the number of (optionally, trainable) parameters in the model. + Get number of (optionally, trainable or non-embeddings) parameters in the module. Args: only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to return only the number of trainable parameters + no_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of non-embeddings parameters + Returns: :obj:`int`: The number of parameters. """ - params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters() + + def parameter_filter(x): + return (x.requires_grad or not only_trainable) and not ( + isinstance(x, torch.nn.Embedding) and no_embeddings + ) + + params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() return sum(p.numel() for p in params) + def floating_point_ops(self, batch_size: int, sequence_length: int, no_embeddings: bool = False) -> int: + """ + Get number of (optionally, non-embeddings) floating-point operations. Default approximation neglects the + quadratic dependency on the number of tokens (valid if 12 * d_model << sequence_length) as laid out in + https://arxiv.org/pdf/2001.08361.pdf section 2.1. Can be overriden for long-form transformers. 
+ """ + + return 6 * batch_size * sequence_length * self.num_parameters(no_embeddings=no_embeddings) + @staticmethod def _hook_rss_memory_pre_forward(module, *args, **kwargs): try: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 06d467a354e2..aa15d4f2d2c4 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -29,6 +29,7 @@ TrainOutput, is_wandb_available, set_seed, + estimate_tokens, ) from .training_args import TrainingArguments @@ -172,6 +173,7 @@ class Trainer: optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None global_step: Optional[int] = None epoch: Optional[float] = None + non_embedding_flos: Optional[int] = None def __init__( self, @@ -469,6 +471,7 @@ def train(self, model_path: Optional[str] = None): self.global_step = 0 self.epoch = 0 + self.non_embedding_flos = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint @@ -476,6 +479,7 @@ def train(self, model_path: Optional[str] = None): # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) + self.non_embedding_flos = getattr(model.config, "non_embedding_flos", 0) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps @@ -484,9 +488,13 @@ def train(self, model_path: Optional[str] = None): logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) + logger.info( + " Continuing training from %d non-embedding floating-point operations", self.non_embedding_flos + ) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 + self.non_embedding_flos = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 @@ -539,6 +547,10 @@ def train(self, model_path: Optional[str] = None): model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) + self.non_embedding_flos += 6 * model.floating_point_ops( + *estimate_tokens(inputs), no_embeddings=True + ) + print(self.non_embedding_flos) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step @@ -581,10 +593,10 @@ def train(self, model_path: Optional[str] = None): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) - if self.args.max_steps > 0 and self.global_step > self.args.max_steps: + if self.global_step > self.args.max_steps > 0: epoch_iterator.close() break - if self.args.max_steps > 0 and self.global_step > self.args.max_steps: + if self.global_step > self.args.max_steps > 0: train_iterator.close() break if self.args.tpu_metrics_debug or self.args.debug: @@ -621,6 +633,8 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: if self.epoch is not None: logs["epoch"] = self.epoch + if self.non_embedding_flos is not None: + logs["non_embedding_flos"] = self.non_embedding_flos if self.global_step is None: # when logging evaluation metrics without training self.global_step = 0 @@ -642,7 +656,7 @@ def log(self, logs: Dict[str, float], 
iterator: Optional[tqdm] = None) -> None: if is_wandb_available(): if self.is_world_master(): wandb.log(logs, step=self.global_step) - output = {**logs, **{"step": self.global_step}} + output = {**logs, **{"step": self.global_step, "neFLOs": self.non_embedding_flos}} if iterator is not None: iterator.write(output) else: @@ -761,6 +775,8 @@ def _save_tpu(self, output_dir: Optional[str] = None): raise ValueError("Trainer.model appears to not be a PreTrainedModel") xm.rendezvous("saving_checkpoint") + # Storing the number of floating-point operations that went into the model + self.model.config.non_embedding_flos = self.non_embedding_flos self.model.save_pretrained(output_dir) def _save(self, output_dir: Optional[str] = None): @@ -771,6 +787,8 @@ def _save(self, output_dir: Optional[str] = None): # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, PreTrainedModel): raise ValueError("Trainer.model appears to not be a PreTrainedModel") + # Storing the number of floating-point operations that went into the model + self.model.config.non_embedding_flos = self.non_embedding_flos self.model.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 1a4e99507236..867c8f151396 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -1,6 +1,7 @@ import os import random from typing import Dict, NamedTuple, Optional +import warnings import numpy as np @@ -46,6 +47,28 @@ def set_seed(seed: int): tf.random.set_seed(seed) +def estimate_tokens(inputs): + """ + Helper function to estimate the batch size and sequence length from the model inputs + + Args: + inputs (:obj:`dict`): The model inputs. + + Returns: + seed (:obj:`tuple`): The batch size and sequence length. + """ + inputs_ids = inputs.get("input_ids") + input_embeds = inputs.get("input_embeds") + if inputs is not None: + return inputs_ids.shape[0], inputs_ids.shape[1] + if input_embeds is not None: + return input_embeds.shape[0], input_embeds.shape[1] + warnings.warn( + "Could not estimate the number of tokens of the input, floating-point operations will" "not be computed" + ) + return 0, 0 + + class EvalPrediction(NamedTuple): """ Evaluation output (always contains labels), to be used to compute metrics. 
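Note on the estimate introduced in the patch above: training cost is approximated as 6 * tokens * non-embedding parameters (forward plus backward pass), following section 2.1 of https://arxiv.org/abs/2001.08361, which neglects the attention term that is quadratic in sequence length. A minimal self-contained sketch of that rule of thumb follows; it assumes the batch carries an `input_ids` tensor, and the helper name `approx_train_flos` is illustrative rather than part of the patch.

import torch

def approx_train_flos(model: torch.nn.Module, batch: dict) -> int:
    """Approximate forward+backward FLOs for one batch: 6 * tokens * non-embedding params."""
    num_tokens = batch["input_ids"].numel()  # batch_size * sequence_length
    # Count every parameter except those registered on embedding modules.
    non_embedding_params = sum(
        p.numel()
        for module in model.modules()
        if not isinstance(module, torch.nn.Embedding)
        for p in module.parameters(recurse=False)
    )
    return 6 * num_tokens * non_embedding_params

Accumulating this value over every training step yields the neFLOs counter that the Trainer logs and stores in the model config at checkpoint time.
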
From b50d3e175f32c64fef8b0b014f316cc52b5cf28a Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 3 Aug 2020 19:13:05 +0200 Subject: [PATCH 02/26] testing distributed consecutive batches --- src/transformers/data/data_collator.py | 3 +++ src/transformers/trainer.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 085f7a68a8aa..6ef95f98ce3a 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -93,6 +93,9 @@ def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]]) def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: length_of_first = examples[0].size(0) are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + for example in self.tokenizer.batch_decode(examples): + print(example) + print("-" * 89) if are_tensors_same_length: return torch.stack(examples, dim=0) else: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 35554af22650..27d24a121ce3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -133,7 +133,7 @@ def __iter__(self): indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] assert ( len(indices) == self.num_samples - ), f"Indices length {len(indices)} and and sample number {self.num_samples} mismatched" + ), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched" return iter(indices) From 6818ed2d93bea0596640affd09cd98af24843256 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 3 Aug 2020 19:31:45 +0200 Subject: [PATCH 03/26] fixed AttributeError from DataParallel --- src/transformers/trainer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 27d24a121ce3..90cb105eda1b 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -555,9 +555,15 @@ def train(self, model_path: Optional[str] = None): model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) - self.non_embedding_flos += 6 * model.floating_point_ops( - *estimate_tokens(inputs), no_embeddings=True - ) + try: + self.non_embedding_flos += 6 * model.floating_point_ops( + *estimate_tokens(inputs), no_embeddings=True + ) + except AttributeError: + # in case this is a DataParallel + self.non_embedding_flos += 6 * model.module.floating_point_ops( + *estimate_tokens(inputs), no_embeddings=True + ) print(self.non_embedding_flos) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( From 53246785e307a593decb005f04ccf0c141db0f5c Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 3 Aug 2020 19:34:58 +0200 Subject: [PATCH 04/26] removed verbosity --- src/transformers/data/data_collator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 6ef95f98ce3a..085f7a68a8aa 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -93,9 +93,6 @@ def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]]) def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: length_of_first = examples[0].size(0) are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) - for example in self.tokenizer.batch_decode(examples): - print(example) - print("-" * 89) if 
are_tensors_same_length: return torch.stack(examples, dim=0) else: From 2636bb8d9f14bc9734afea7c2dd6c0b5603de2da Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 3 Aug 2020 19:49:55 +0200 Subject: [PATCH 05/26] rotate with use_mtime=True --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 90cb105eda1b..11f91e524c19 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -599,7 +599,7 @@ def train(self, model_path: Optional[str] = None): self.save_model(output_dir) if self.is_world_process_zero(): - self._rotate_checkpoints() + self._rotate_checkpoints(use_mtime=True) if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") From 04e471b4f59cf87529fc905bbe79a2e4fc6b37e4 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Tue, 4 Aug 2020 10:28:00 +0200 Subject: [PATCH 06/26] removed print --- src/transformers/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 11f91e524c19..7fda093d1900 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -564,7 +564,6 @@ def train(self, model_path: Optional[str] = None): self.non_embedding_flos += 6 * model.module.floating_point_ops( *estimate_tokens(inputs), no_embeddings=True ) - print(self.non_embedding_flos) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step From 9e7c05aec5c5f986bad162fad9af2a466de98e05 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Thu, 6 Aug 2020 09:16:09 +0200 Subject: [PATCH 07/26] fixed interaction with gradient accumulation --- src/transformers/trainer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 2927b36a83b2..de78e4814f02 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -532,6 +532,16 @@ def train(self, model_path: Optional[str] = None): continue tr_loss += self.training_step(model, inputs) + + try: + self.non_embedding_flos += 6 * model.floating_point_ops( + *estimate_tokens(inputs), no_embeddings=True + ) + except AttributeError: + # in case this is a DataParallel + self.non_embedding_flos += 6 * model.module.floating_point_ops( + *estimate_tokens(inputs), no_embeddings=True + ) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps @@ -558,15 +568,6 @@ def train(self, model_path: Optional[str] = None): model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) - try: - self.non_embedding_flos += 6 * model.floating_point_ops( - *estimate_tokens(inputs), no_embeddings=True - ) - except AttributeError: - # in case this is a DataParallel - self.non_embedding_flos += 6 * model.module.floating_point_ops( - *estimate_tokens(inputs), no_embeddings=True - ) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step From 8def613f822c2443a4112fc106ece8d2d86cf3f9 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Fri, 7 Aug 2020 04:17:28 +0200 Subject: [PATCH 08/26] indent formatting --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 
de78e4814f02..ddfafd16c995 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -532,7 +532,7 @@ def train(self, model_path: Optional[str] = None): continue tr_loss += self.training_step(model, inputs) - + try: self.non_embedding_flos += 6 * model.floating_point_ops( *estimate_tokens(inputs), no_embeddings=True From 70f919f18f67f68d7c9740f9ee45ee4abc286e15 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Wed, 26 Aug 2020 17:29:45 +0200 Subject: [PATCH 09/26] distributed neflo counting --- src/transformers/trainer.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a2db2b575a63..a6bee7200162 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -545,6 +545,7 @@ def train(self, model_path: Optional[str] = None): self.global_step = 0 self.epoch = 0 + # Has to be a tensor to be self.non_embedding_flos = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 @@ -728,7 +729,11 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: if self.epoch is not None: logs["epoch"] = self.epoch if self.non_embedding_flos is not None: - logs["non_embedding_flos"] = self.non_embedding_flos + if self.args.local_rank != -1: + gathered_flos = self.distributed_broadcast_scalar(self.non_embedding_flos) + logs["non_embedding_flos"] = gathered_flos.sum().item() + else: + logs["non_embedding_flos"] = self.non_embedding_flos if self.global_step is None: # when logging evaluation metrics without training self.global_step = 0 @@ -902,7 +907,12 @@ def _save_tpu(self, output_dir: Optional[str] = None): xm.rendezvous("saving_checkpoint") # Storing the number of floating-point operations that went into the model - self.model.config.non_embedding_flos = self.non_embedding_flos + if self.non_embedding_flos is not None: + if self.args.local_rank != -1: + gathered_flos = self.distributed_broadcast_scalar(self.non_embedding_flos) + self.model.config.non_embedding_flos = gathered_flos.sum().item() + else: + self.model.config.non_embedding_flos = self.non_embedding_flos self.model.save_pretrained(output_dir) def _save(self, output_dir: Optional[str] = None): @@ -1110,6 +1120,16 @@ def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> t output = concat[:num_total_examples] return output + def distributed_broadcast_scalar(self, scalar: Union[int, float]) -> torch.Tensor: + assert self.args.local_rank != -1 + + tensorized_scalar = torch.Tensor(scalar) + output_tensors = [tensorized_scalar.clone() for _ in range(torch.distributed.get_world_size())] + torch.distributed.all_gather(output_tensors, tensorized_scalar) + concat = torch.cat(output_tensors, dim=0) + + return concat + def prediction_step( self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: From 349e91699a12e2e37e2729ffd9aafdac9e62e6a2 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Wed, 26 Aug 2020 17:32:09 +0200 Subject: [PATCH 10/26] fixed typo --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a6bee7200162..835c1c73595f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1123,7 +1123,7 @@ def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> t def distributed_broadcast_scalar(self, 
scalar: Union[int, float]) -> torch.Tensor: assert self.args.local_rank != -1 - tensorized_scalar = torch.Tensor(scalar) + tensorized_scalar = torch.Tensor([scalar]) output_tensors = [tensorized_scalar.clone() for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(output_tensors, tensorized_scalar) concat = torch.cat(output_tensors, dim=0) From 9cc578de2d5bb691758d76c3244ee88aa811615d Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Wed, 26 Aug 2020 17:34:07 +0200 Subject: [PATCH 11/26] fixed typo --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 835c1c73595f..6545384e453c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1123,7 +1123,7 @@ def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> t def distributed_broadcast_scalar(self, scalar: Union[int, float]) -> torch.Tensor: assert self.args.local_rank != -1 - tensorized_scalar = torch.Tensor([scalar]) + tensorized_scalar = torch.Tensor([scalar]).cuda() output_tensors = [tensorized_scalar.clone() for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(output_tensors, tensorized_scalar) concat = torch.cat(output_tensors, dim=0) From 03fe0151587816fe7f72d5ea62df884e97eecab1 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Wed, 26 Aug 2020 19:33:17 +0200 Subject: [PATCH 12/26] mean distributed losses --- src/transformers/trainer.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 6545384e453c..3e0e2555d886 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -730,7 +730,7 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: logs["epoch"] = self.epoch if self.non_embedding_flos is not None: if self.args.local_rank != -1: - gathered_flos = self.distributed_broadcast_scalar(self.non_embedding_flos) + gathered_flos = self.distributed_broadcast_scalars([self.non_embedding_flos]) logs["non_embedding_flos"] = gathered_flos.sum().item() else: logs["non_embedding_flos"] = self.non_embedding_flos @@ -760,7 +760,7 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: experiment = comet_ml.config.get_global_experiment() if experiment is not None: experiment._log_metrics(logs, step=self.global_step, epoch=self.epoch, framework="transformers") - output = {**logs, **{"step": self.global_step, "neFLOs": self.non_embedding_flos}} + output = {**logs, **{"step": self.global_step}} if iterator is not None: iterator.write(output) else: @@ -909,7 +909,7 @@ def _save_tpu(self, output_dir: Optional[str] = None): # Storing the number of floating-point operations that went into the model if self.non_embedding_flos is not None: if self.args.local_rank != -1: - gathered_flos = self.distributed_broadcast_scalar(self.non_embedding_flos) + gathered_flos = self.distributed_broadcast_scalars([self.non_embedding_flos]) self.model.config.non_embedding_flos = gathered_flos.sum().item() else: self.model.config.non_embedding_flos = self.non_embedding_flos @@ -1059,13 +1059,11 @@ def prediction_loop( if self.args.past_index >= 0: self._past = None - samples_count = 0 for inputs in tqdm(dataloader, desc=description): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only) batch_size = inputs[list(inputs.keys())[0]].shape[0] - samples_count += batch_size if 
loss is not None: - eval_losses.append(loss * batch_size) + eval_losses.extend([loss] * batch_size) if logits is not None: preds = logits if preds is None else torch.cat((preds, logits), dim=0) if labels is not None: @@ -1099,7 +1097,11 @@ def prediction_loop( else: metrics = {} if len(eval_losses) > 0: - metrics["eval_loss"] = np.sum(eval_losses) / samples_count + if self.args.local_rank != -1: + metrics["eval_loss"] = self.distributed_broadcast_scalars(eval_losses).mean().item() + else: + metrics["eval_loss"] = np.mean(eval_losses) + # Prefix all keys with eval_ for key in list(metrics.keys()): @@ -1120,10 +1122,10 @@ def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> t output = concat[:num_total_examples] return output - def distributed_broadcast_scalar(self, scalar: Union[int, float]) -> torch.Tensor: + def distributed_broadcast_scalars(self, scalars: List[Union[int, float]]) -> torch.Tensor: assert self.args.local_rank != -1 - tensorized_scalar = torch.Tensor([scalar]).cuda() + tensorized_scalar = torch.Tensor(scalars).cuda() output_tensors = [tensorized_scalar.clone() for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(output_tensors, tensorized_scalar) concat = torch.cat(output_tensors, dim=0) From fa43ae1222ef1f8699a8127d5d09f1c25dcb726c Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Thu, 27 Aug 2020 17:42:04 +0200 Subject: [PATCH 13/26] exporting log history --- src/transformers/trainer.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 3e0e2555d886..7cc36b039083 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1,4 +1,5 @@ import inspect +import json import logging import math import os @@ -193,6 +194,7 @@ def __init__( self.compute_metrics = compute_metrics self.optimizer, self.lr_scheduler = optimizers self.tb_writer = tb_writer + self.log_history = [] if "prediction_loss_only" in kwargs: warnings.warn( "Passing `prediction_loss_only` as a keyword argument is deprecated and won't be possible in a future version. Use `args.prediction_loss_only` instead.", @@ -761,6 +763,8 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: if experiment is not None: experiment._log_metrics(logs, step=self.global_step, epoch=self.epoch, framework="transformers") output = {**logs, **{"step": self.global_step}} + if self.is_world_process_zero(): + self.log_history.append(output) if iterator is not None: iterator.write(output) else: @@ -899,6 +903,11 @@ def _save_tpu(self, output_dir: Optional[str] = None): if xm.is_master_ordinal(): os.makedirs(output_dir, exist_ok=True) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + json.dump( + self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), + indent=2, + ensure_ascii=False + ) # Save a trained model and configuration using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` @@ -929,6 +938,11 @@ def _save(self, output_dir: Optional[str] = None): # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + json.dump( + self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), + indent=2, + ensure_ascii=False + ) def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]: ordering_and_checkpoint_path = [] @@ -1098,11 +1112,12 @@ def prediction_loop( metrics = {} if len(eval_losses) > 0: if self.args.local_rank != -1: - metrics["eval_loss"] = self.distributed_broadcast_scalars(eval_losses).mean().item() + metrics["eval_loss"] = self.distributed_broadcast_scalars( + eval_losses, num_total_examples=self.num_examples(dataloader) + ).mean().item() else: metrics["eval_loss"] = np.mean(eval_losses) - # Prefix all keys with eval_ for key in list(metrics.keys()): if not key.startswith("eval_"): @@ -1110,19 +1125,20 @@ def prediction_loop( return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) - def distributed_concat(self, tensor: torch.Tensor, num_total_examples: int) -> torch.Tensor: + def distributed_concat(self, tensor: torch.Tensor, num_total_examples: Optional[int] = None) -> torch.Tensor: assert self.args.local_rank != -1 output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(output_tensors, tensor) - concat = torch.cat(output_tensors, dim=0) # truncate the dummy elements added by SequentialDistributedSampler - output = concat[:num_total_examples] - return output + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat - def distributed_broadcast_scalars(self, scalars: List[Union[int, float]]) -> torch.Tensor: + def distributed_broadcast_scalars(self, scalars: List[Union[int, float]], num_total_examples: Optional[int] = None)\ + -> torch.Tensor: assert self.args.local_rank != -1 tensorized_scalar = torch.Tensor(scalars).cuda() @@ -1130,6 +1146,9 @@ def distributed_broadcast_scalars(self, scalars: List[Union[int, float]]) -> tor torch.distributed.all_gather(output_tensors, tensorized_scalar) concat = torch.cat(output_tensors, dim=0) + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] return concat def prediction_step( From e7a249faa6cb0d5730e09ea96b5739dd3726cc8f Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Thu, 27 Aug 2020 18:38:37 +0200 Subject: [PATCH 14/26] moved a few functions --- src/transformers/modeling_utils.py | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d59c20b7a2d9..688db515f16c 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -92,38 +92,6 @@ class ModuleUtilsMixin: A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin. """ - def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = False) -> int: - """ - Get number of (optionally, trainable or non-embeddings) parameters in the module. 
- - Args: - only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to return only the number of trainable parameters - - no_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to return only the number of non-embeddings parameters - - Returns: - :obj:`int`: The number of parameters. - """ - - def parameter_filter(x): - return (x.requires_grad or not only_trainable) and not ( - isinstance(x, torch.nn.Embedding) and no_embeddings - ) - - params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() - return sum(p.numel() for p in params) - - def floating_point_ops(self, batch_size: int, sequence_length: int, no_embeddings: bool = False) -> int: - """ - Get number of (optionally, non-embeddings) floating-point operations. Default approximation neglects the - quadratic dependency on the number of tokens (valid if 12 * d_model << sequence_length) as laid out in - https://arxiv.org/pdf/2001.08361.pdf section 2.1. Can be overriden for long-form transformers. - """ - - return 6 * batch_size * sequence_length * self.num_parameters(no_embeddings=no_embeddings) - @staticmethod def _hook_rss_memory_pre_forward(module, *args, **kwargs): try: @@ -329,6 +297,38 @@ def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility return head_mask + def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = False) -> int: + """ + Get number of (optionally, trainable or non-embeddings) parameters in the module. + + Args: + only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of trainable parameters + + no_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of non-embeddings parameters + + Returns: + :obj:`int`: The number of parameters. + """ + + def parameter_filter(x): + return (x.requires_grad or not only_trainable) and not ( + isinstance(x, torch.nn.Embedding) and no_embeddings + ) + + params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() + return sum(p.numel() for p in params) + + def floating_point_ops(self, batch_size: int, sequence_length: int, no_embeddings: bool = False) -> int: + """ + Get number of (optionally, non-embeddings) floating-point operations. Default approximation neglects the + quadratic dependency on the number of tokens (valid if 12 * d_model << sequence_length) as laid out in + https://arxiv.org/pdf/2001.08361.pdf section 2.1. Can be overriden for long-form transformers. + """ + + return 6 * batch_size * sequence_length * self.num_parameters(no_embeddings=no_embeddings) + class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): r""" From 45f5fcbd0b0a5ab2aa401a135309ce08289bb161 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Thu, 27 Aug 2020 19:04:56 +0200 Subject: [PATCH 15/26] floating_point_ops clarification for transformers with parameter-reuse --- src/transformers/modeling_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 688db515f16c..a9898ad0f102 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -324,7 +324,8 @@ def floating_point_ops(self, batch_size: int, sequence_length: int, no_embedding """ Get number of (optionally, non-embeddings) floating-point operations. 
Default approximation neglects the quadratic dependency on the number of tokens (valid if 12 * d_model << sequence_length) as laid out in - https://arxiv.org/pdf/2001.08361.pdf section 2.1. Can be overriden for long-form transformers. + https://arxiv.org/pdf/2001.08361.pdf section 2.1. Should be overriden for transformers with parameter re-use + e.g. Albert or Universal Transformers. """ return 6 * batch_size * sequence_length * self.num_parameters(no_embeddings=no_embeddings) From 69d2b1e27345fa06b22c96034a154ce5a79a1032 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Thu, 27 Aug 2020 19:19:14 +0200 Subject: [PATCH 16/26] code quality --- src/transformers/trainer.py | 23 +++++++++++------------ src/transformers/trainer_utils.py | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 840e85410577..ba0383dfdac2 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -42,8 +42,8 @@ TrainOutput, default_compute_objective, default_hp_space, + estimate_tokens, set_seed, - estimate_tokens ) from .training_args import TrainingArguments from .utils import logging @@ -1121,9 +1121,7 @@ def _save_tpu(self, output_dir: Optional[str] = None): os.makedirs(output_dir, exist_ok=True) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) json.dump( - self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), - indent=2, - ensure_ascii=False + self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), indent=2, ensure_ascii=False ) # Save a trained model and configuration using `save_pretrained()`. @@ -1160,9 +1158,7 @@ def _save(self, output_dir: Optional[str] = None): # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, "training_args.bin")) json.dump( - self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), - indent=2, - ensure_ascii=False + self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), indent=2, ensure_ascii=False ) def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]: @@ -1334,9 +1330,11 @@ def prediction_loop( metrics = {} if len(eval_losses) > 0: if self.args.local_rank != -1: - metrics["eval_loss"] = self.distributed_broadcast_scalars( - eval_losses, num_total_examples=self.num_examples(dataloader) - ).mean().item() + metrics["eval_loss"] = ( + self.distributed_broadcast_scalars(eval_losses, num_total_examples=self.num_examples(dataloader)) + .mean() + .item() + ) else: metrics["eval_loss"] = np.mean(eval_losses) @@ -1359,8 +1357,9 @@ def distributed_concat(self, tensor: torch.Tensor, num_total_examples: Optional[ concat = concat[:num_total_examples] return concat - def distributed_broadcast_scalars(self, scalars: List[Union[int, float]], num_total_examples: Optional[int] = None)\ - -> torch.Tensor: + def distributed_broadcast_scalars( + self, scalars: List[Union[int, float]], num_total_examples: Optional[int] = None + ) -> torch.Tensor: assert self.args.local_rank != -1 tensorized_scalar = torch.Tensor(scalars).cuda() diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 2dfb6129fec6..5fac84cc9b37 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -1,6 +1,6 @@ import random -from typing import Any, Dict, NamedTuple, Optional import warnings +from typing import Any, Dict, NamedTuple, Optional import numpy as np 
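For reference, the distributed counting added in the patches above relies on the standard all-gather pattern: each rank tensorizes its local scalars on GPU, every rank gathers every rank's tensor, and the caller reduces the result (sum for FLO counters, mean for per-example eval losses). Below is a self-contained sketch of that pattern, assuming the default process group is already initialized on a CUDA backend; the function name is illustrative, not the patch's exact code.

from typing import List, Union

import torch
import torch.distributed as dist

def gather_scalars(scalars: List[Union[int, float]]) -> torch.Tensor:
    """Gather a list of local scalars from every rank into one flat tensor."""
    local = torch.tensor(scalars, dtype=torch.float32, device="cuda")
    # all_gather fills one buffer per rank; tensors must have the same shape on every rank.
    buffers = [torch.empty_like(local) for _ in range(dist.get_world_size())]
    dist.all_gather(buffers, local)
    return torch.cat(buffers, dim=0)

# Typical reductions used by the Trainer:
#   total FLOs across ranks: gather_scalars([local_flos]).sum().item()
#   mean eval loss:          gather_scalars(local_losses).mean().item()
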
From d796eef038b976b8435ccb6d48db690a48f19732 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Thu, 27 Aug 2020 19:22:20 +0200 Subject: [PATCH 17/26] double import --- src/transformers/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index ba0383dfdac2..1bb64733a26d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1,6 +1,5 @@ import inspect import json -import logging import math import os import re From c1751424b3ec900f0d0882816dd6d63f7fa569de Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Fri, 28 Aug 2020 17:10:22 +0200 Subject: [PATCH 18/26] made flo estimation more task-agnostic --- src/transformers/modeling_utils.py | 124 ++++++++++++++++++----------- src/transformers/trainer.py | 71 ++++++++++++----- src/transformers/trainer_utils.py | 22 ----- 3 files changed, 129 insertions(+), 88 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index dc2fd383ad49..3890f69416b1 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -18,7 +18,8 @@ import os import re from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Callable, Dict, List, Optional, Set, Tuple, Union, Any +import warnings import torch from torch import Tensor, device, dtype, nn @@ -42,10 +43,8 @@ from .generation_utils import GenerationMixin from .utils import logging - logger = logging.get_logger(__name__) - try: from torch.nn import Identity except ImportError: @@ -61,7 +60,7 @@ def forward(self, input): def find_pruneable_heads_and_indices( - heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] + heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] ) -> Tuple[Set[int], torch.LongTensor]: """ Finds the heads and their indices taking :obj:`already_pruned_heads` into account. @@ -259,7 +258,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple return extended_attention_mask def get_head_mask( - self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False ) -> Tensor: """ Prepare the head mask if needed. 
@@ -293,7 +292,7 @@ def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility return head_mask def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = False) -> int: @@ -313,21 +312,56 @@ def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = Fal def parameter_filter(x): return (x.requires_grad or not only_trainable) and not ( - isinstance(x, torch.nn.Embedding) and no_embeddings + isinstance(x, torch.nn.Embedding) and no_embeddings ) params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() return sum(p.numel() for p in params) - def floating_point_ops(self, batch_size: int, sequence_length: int, no_embeddings: bool = False) -> int: + def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]): """ - Get number of (optionally, non-embeddings) floating-point operations. Default approximation neglects the - quadratic dependency on the number of tokens (valid if 12 * d_model << sequence_length) as laid out in - https://arxiv.org/pdf/2001.08361.pdf section 2.1. Should be overriden for transformers with parameter re-use - e.g. Albert or Universal Transformers. + Helper function to estimate the batch size and sequence length from the model inputs. Returned batch size is the + first dimension of input tensors, returned sequence length is the sum of the second dimensions of all input + tensors. + + Args: + inputs (:obj:`dict`): The model inputs. + + Returns: + seed (:obj:`tuple`): The batch size and sequence length. + """ + token_inputs = [tensor for key, tensor in input_dict.items() if "input" in key] + if token_inputs: + return sum([token_input.numel() for token_input in token_inputs]) + else: + warnings.warn( + "Could not estimate the number of tokens of the input, floating-point operations will not be computed" + ) + return 0 + + def floating_point_ops(self, input_dict: Dict[str, Union[torch.Tensor, Any]], no_embeddings: bool = True) -> int: + """ + Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a + batch with this transformer model. Default approximation neglects the quadratic dependency on the number of + tokens (valid if 12 * d_model << sequence_length) as laid out in https://arxiv.org/pdf/2001.08361.pdf section + 2.1. Should be overriden for transformers with parameter re-use e.g. Albert or Universal Transformers, or + if doing long-range modeling with very high sequence lengths. + + Args: + batch_size (:obj:`int`): + The batch size for the forward pass. + + sequence_length (:obj:`int`): + The number of tokens in each line of the batch. + + no_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to count embedding and softmax operations. + + Returns: + :obj:`int`: The number of floating-point operations. 
""" - return 6 * batch_size * sequence_length * self.num_parameters(no_embeddings=no_embeddings) + return 6 * self.estimate_tokens(input_dict) * self.num_parameters(no_embeddings=no_embeddings) class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): @@ -443,11 +477,11 @@ def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_mo assert decoder.__class__ == encoder.__class__, f"{decoder.__class__} and {encoder.__class__} have to be equal." def tie_encoder_to_decoder_recursively( - decoder_pointer: nn.Module, - encoder_pointer: nn.Module, - module_name: str, - uninitialized_encoder_weights: List[str], - depth=0, + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + uninitialized_encoder_weights: List[str], + depth=0, ): assert isinstance(decoder_pointer, nn.Module) and isinstance( encoder_pointer, nn.Module @@ -464,7 +498,7 @@ def tie_encoder_to_decoder_recursively( decoder_modules = decoder_pointer._modules if len(decoder_modules) > 0: assert ( - len(encoder_modules) > 0 + len(encoder_modules) > 0 ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()]) @@ -562,7 +596,7 @@ def _resize_token_embeddings(self, new_num_tokens): return self.get_input_embeddings() def _get_resized_embeddings( - self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None + self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None ) -> torch.nn.Embedding: """ Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly @@ -1054,7 +1088,7 @@ def __init__(self, config: PretrainedConfig): self.dense = nn.Linear(config.hidden_size, 1) def forward( - self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None + self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None ) -> torch.FloatTensor: """ Args: @@ -1096,11 +1130,11 @@ def __init__(self, config: PretrainedConfig): self.dense_1 = nn.Linear(config.hidden_size, 1) def forward( - self, - hidden_states: torch.FloatTensor, - start_states: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - p_mask: Optional[torch.FloatTensor] = None, + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, ) -> torch.FloatTensor: """ Args: @@ -1123,7 +1157,7 @@ def forward( :obj:`torch.FloatTensor`: The end logits for SQuAD. 
""" assert ( - start_states is not None or start_positions is not None + start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: slen, hsz = hidden_states.shape[-2:] @@ -1161,11 +1195,11 @@ def __init__(self, config): self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) def forward( - self, - hidden_states: torch.FloatTensor, - start_states: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - cls_index: Optional[torch.LongTensor] = None, + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, ) -> torch.FloatTensor: """ Args: @@ -1189,7 +1223,7 @@ def forward( # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample. hsz = hidden_states.shape[-1] assert ( - start_states is not None or start_positions is not None + start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) @@ -1258,14 +1292,14 @@ def __init__(self, config): @replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig) def forward( - self, - hidden_states: torch.FloatTensor, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - cls_index: Optional[torch.LongTensor] = None, - is_impossible: Optional[torch.LongTensor] = None, - p_mask: Optional[torch.FloatTensor] = None, - return_dict: bool = False, + self, + hidden_states: torch.FloatTensor, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + is_impossible: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + return_dict: bool = False, ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]: """ Args: @@ -1412,7 +1446,7 @@ def __init__(self, config: PretrainedConfig): self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward( - self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None ) -> torch.FloatTensor: """ Compute a single vector summary of a sequence hidden states. @@ -1524,7 +1558,7 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> def prune_layer( - layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None + layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None ) -> Union[torch.nn.Linear, Conv1D]: """ Prune a Conv1D or linear layer to keep only entries in index. 
@@ -1549,7 +1583,7 @@ def prune_layer( def apply_chunking_to_forward( - forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors + forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors ) -> torch.Tensor: """ This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the @@ -1599,7 +1633,7 @@ def forward(self, hidden_states): if chunk_size > 0: assert ( - input_tensors[0].shape[chunk_dim] % chunk_size == 0 + input_tensors[0].shape[chunk_dim] % chunk_size == 0 ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format( input_tensors[0].shape[chunk_dim], chunk_size ) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1bb64733a26d..f9153b74f910 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -41,7 +41,6 @@ TrainOutput, default_compute_objective, default_hp_space, - estimate_tokens, set_seed, ) from .training_args import TrainingArguments @@ -295,6 +294,7 @@ def __init__( self.global_step = None self.epoch = None + self.total_flos = None if self.args.fp16 and _use_native_amp: self.scaler = torch.cuda.amp.GradScaler() self.hp_search_backend = None @@ -471,7 +471,11 @@ def setup_wandb(self): logger.info( 'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"' ) - combined_dict = {**self.model.config.to_dict(), **self.args.to_sanitized_dict()} + try: + combined_dict = {**self.model.config.to_dict(), **self.args.to_sanitized_dict()} + except AttributeError: + # in case the model has no config + combined_dict = {**self.args.to_sanitized_dict()} wandb.init( project=os.getenv("WANDB_PROJECT", "huggingface"), config=combined_dict, name=self.args.run_name ) @@ -641,7 +645,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D self.global_step = 0 self.epoch = 0 - self.non_embedding_flos = 0 + self.total_flos = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint @@ -649,7 +653,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) - self.non_embedding_flos = getattr(model.config, "non_embedding_flos", 0) + self.total_flos = getattr(model.config, "total_flos", 0) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps @@ -659,12 +663,12 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info( - " Continuing training from %d non-embedding floating-point operations", self.non_embedding_flos + " Continuing training from %d non-embedding floating-point operations", self.total_flos ) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 - self.non_embedding_flos = 0 + self.total_flos = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 @@ -700,14 +704,10 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D tr_loss += self.training_step(model, inputs) try: - 
self.non_embedding_flos += 6 * model.floating_point_ops( - *estimate_tokens(inputs), no_embeddings=True - ) + self.total_flos += self.floating_point_ops(model, inputs) except AttributeError: # in case this is a DataParallel - self.non_embedding_flos += 6 * model.module.floating_point_ops( - *estimate_tokens(inputs), no_embeddings=True - ) + self.total_flos += self.floating_point_ops(model.module, inputs) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps @@ -946,12 +946,12 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: if self.epoch is not None: logs["epoch"] = self.epoch - if self.non_embedding_flos is not None: + if self.total_flos is not None: if self.args.local_rank != -1: - gathered_flos = self.distributed_broadcast_scalars([self.non_embedding_flos]) - logs["non_embedding_flos"] = gathered_flos.sum().item() + gathered_flos = self.distributed_broadcast_scalars([self.total_flos]) + logs["total_flos"] = gathered_flos.sum().item() else: - logs["non_embedding_flos"] = self.non_embedding_flos + logs["total_flos"] = self.total_flos if self.global_step is None: # when logging evaluation metrics without training self.global_step = 0 @@ -1130,12 +1130,13 @@ def _save_tpu(self, output_dir: Optional[str] = None): xm.rendezvous("saving_checkpoint") # Storing the number of floating-point operations that went into the model - if self.non_embedding_flos is not None: + if self.total_flos is not None: if self.args.local_rank != -1: - gathered_flos = self.distributed_broadcast_scalars([self.non_embedding_flos]) - self.model.config.non_embedding_flos = gathered_flos.sum().item() + total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() else: - self.model.config.non_embedding_flos = self.non_embedding_flos + total_flos = self.total_flos + if total_flos > 0: + self.model.config.total_flos = total_flos self.model.save_pretrained(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -1149,7 +1150,13 @@ def _save(self, output_dir: Optional[str] = None): if not isinstance(self.model, PreTrainedModel): raise ValueError("Trainer.model appears to not be a PreTrainedModel") # Storing the number of floating-point operations that went into the model - self.model.config.non_embedding_flos = self.non_embedding_flos + if self.total_flos is not None: + if self.args.local_rank != -1: + total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() + else: + total_flos = self.total_flos + if total_flos > 0: + self.model.config.total_flos = total_flos self.model.save_pretrained(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -1416,3 +1423,25 @@ def prediction_step( if labels is not None: labels = labels.detach() return (loss, logits.detach(), labels) + + def floating_point_ops(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]): + """ + For models with a `floating_point_ops` method (e.g. models that inherit from `PretrainedModel`), uses + that method to compute the number of floating point operations for every backward + forward pass. If using + another model, either implement such a method in the model or override this method. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + Returns: + :obj:`int`: The number of floating-point operations. 
+ """ + + if hasattr(model, "floating_point_ops"): + return model.floating_point_ops(inputs) + + else: + return 0 diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 5fac84cc9b37..c61b91138eff 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -31,28 +31,6 @@ def set_seed(seed: int): tf.random.set_seed(seed) -def estimate_tokens(inputs): - """ - Helper function to estimate the batch size and sequence length from the model inputs - - Args: - inputs (:obj:`dict`): The model inputs. - - Returns: - seed (:obj:`tuple`): The batch size and sequence length. - """ - inputs_ids = inputs.get("input_ids") - input_embeds = inputs.get("input_embeds") - if inputs is not None: - return inputs_ids.shape[0], inputs_ids.shape[1] - if input_embeds is not None: - return input_embeds.shape[0], input_embeds.shape[1] - warnings.warn( - "Could not estimate the number of tokens of the input, floating-point operations will" "not be computed" - ) - return 0, 0 - - class EvalPrediction(NamedTuple): """ Evaluation output (always contains labels), to be used to compute metrics. From 1773dd643b62f990082897eab1138dac217f0394 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Fri, 28 Aug 2020 17:14:03 +0200 Subject: [PATCH 19/26] only logging flos if computed --- src/transformers/trainer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f9153b74f910..75968f5c686d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -948,9 +948,10 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: logs["epoch"] = self.epoch if self.total_flos is not None: if self.args.local_rank != -1: - gathered_flos = self.distributed_broadcast_scalars([self.total_flos]) - logs["total_flos"] = gathered_flos.sum().item() + total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() else: + total_flos = self.total_flos + if total_flos > 0: logs["total_flos"] = self.total_flos if self.global_step is None: # when logging evaluation metrics without training @@ -1135,8 +1136,8 @@ def _save_tpu(self, output_dir: Optional[str] = None): total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() else: total_flos = self.total_flos - if total_flos > 0: - self.model.config.total_flos = total_flos + if total_flos > 0: + self.model.config.total_flos = total_flos self.model.save_pretrained(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -1155,8 +1156,8 @@ def _save(self, output_dir: Optional[str] = None): total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() else: total_flos = self.total_flos - if total_flos > 0: - self.model.config.total_flos = total_flos + if total_flos > 0: + self.model.config.total_flos = total_flos self.model.save_pretrained(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) From 4610852c1bc5c352fd095ec7556662904ea04c13 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Fri, 28 Aug 2020 17:14:38 +0200 Subject: [PATCH 20/26] code quality --- src/transformers/modeling_utils.py | 75 +++++++++++++++--------------- src/transformers/trainer.py | 4 +- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 3890f69416b1..d0f81cae5842 100755 --- a/src/transformers/modeling_utils.py +++ 
b/src/transformers/modeling_utils.py @@ -17,9 +17,9 @@ import inspect import os import re -from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Set, Tuple, Union, Any import warnings +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import torch from torch import Tensor, device, dtype, nn @@ -43,6 +43,7 @@ from .generation_utils import GenerationMixin from .utils import logging + logger = logging.get_logger(__name__) try: @@ -60,7 +61,7 @@ def forward(self, input): def find_pruneable_heads_and_indices( - heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] + heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] ) -> Tuple[Set[int], torch.LongTensor]: """ Finds the heads and their indices taking :obj:`already_pruned_heads` into account. @@ -258,7 +259,7 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple return extended_attention_mask def get_head_mask( - self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False ) -> Tensor: """ Prepare the head mask if needed. @@ -312,7 +313,7 @@ def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = Fal def parameter_filter(x): return (x.requires_grad or not only_trainable) and not ( - isinstance(x, torch.nn.Embedding) and no_embeddings + isinstance(x, torch.nn.Embedding) and no_embeddings ) params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() @@ -477,11 +478,11 @@ def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_mo assert decoder.__class__ == encoder.__class__, f"{decoder.__class__} and {encoder.__class__} have to be equal." def tie_encoder_to_decoder_recursively( - decoder_pointer: nn.Module, - encoder_pointer: nn.Module, - module_name: str, - uninitialized_encoder_weights: List[str], - depth=0, + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + uninitialized_encoder_weights: List[str], + depth=0, ): assert isinstance(decoder_pointer, nn.Module) and isinstance( encoder_pointer, nn.Module @@ -498,7 +499,7 @@ def tie_encoder_to_decoder_recursively( decoder_modules = decoder_pointer._modules if len(decoder_modules) > 0: assert ( - len(encoder_modules) > 0 + len(encoder_modules) > 0 ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()]) @@ -596,7 +597,7 @@ def _resize_token_embeddings(self, new_num_tokens): return self.get_input_embeddings() def _get_resized_embeddings( - self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None + self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None ) -> torch.nn.Embedding: """ Build a resized Embedding Module from a provided token Embedding Module. 
Increasing the size will add newly @@ -1088,7 +1089,7 @@ def __init__(self, config: PretrainedConfig): self.dense = nn.Linear(config.hidden_size, 1) def forward( - self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None + self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None ) -> torch.FloatTensor: """ Args: @@ -1130,11 +1131,11 @@ def __init__(self, config: PretrainedConfig): self.dense_1 = nn.Linear(config.hidden_size, 1) def forward( - self, - hidden_states: torch.FloatTensor, - start_states: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - p_mask: Optional[torch.FloatTensor] = None, + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, ) -> torch.FloatTensor: """ Args: @@ -1157,7 +1158,7 @@ def forward( :obj:`torch.FloatTensor`: The end logits for SQuAD. """ assert ( - start_states is not None or start_positions is not None + start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: slen, hsz = hidden_states.shape[-2:] @@ -1195,11 +1196,11 @@ def __init__(self, config): self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) def forward( - self, - hidden_states: torch.FloatTensor, - start_states: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - cls_index: Optional[torch.LongTensor] = None, + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, ) -> torch.FloatTensor: """ Args: @@ -1223,7 +1224,7 @@ def forward( # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample. 
hsz = hidden_states.shape[-1] assert ( - start_states is not None or start_positions is not None + start_states is not None or start_positions is not None ), "One of start_states, start_positions should be not None" if start_positions is not None: start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) @@ -1292,14 +1293,14 @@ def __init__(self, config): @replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig) def forward( - self, - hidden_states: torch.FloatTensor, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - cls_index: Optional[torch.LongTensor] = None, - is_impossible: Optional[torch.LongTensor] = None, - p_mask: Optional[torch.FloatTensor] = None, - return_dict: bool = False, + self, + hidden_states: torch.FloatTensor, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + is_impossible: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + return_dict: bool = False, ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]: """ Args: @@ -1446,7 +1447,7 @@ def __init__(self, config: PretrainedConfig): self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward( - self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None ) -> torch.FloatTensor: """ Compute a single vector summary of a sequence hidden states. @@ -1558,7 +1559,7 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> def prune_layer( - layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None + layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None ) -> Union[torch.nn.Linear, Conv1D]: """ Prune a Conv1D or linear layer to keep only entries in index. 
@@ -1583,7 +1584,7 @@ def prune_layer( def apply_chunking_to_forward( - forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors + forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors ) -> torch.Tensor: """ This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the @@ -1633,7 +1634,7 @@ def forward(self, hidden_states): if chunk_size > 0: assert ( - input_tensors[0].shape[chunk_dim] % chunk_size == 0 + input_tensors[0].shape[chunk_dim] % chunk_size == 0 ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format( input_tensors[0].shape[chunk_dim], chunk_size ) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 75968f5c686d..36816bab6717 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -662,9 +662,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) - logger.info( - " Continuing training from %d non-embedding floating-point operations", self.total_flos - ) + logger.info(" Continuing training from %d non-embedding floating-point operations", self.total_flos) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 From fae5254928dcb7b87a843d6579c2ec7d0478f3d0 Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Fri, 28 Aug 2020 17:17:27 +0200 Subject: [PATCH 21/26] unused import --- src/transformers/trainer_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index c61b91138eff..d5556f16c3a7 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -1,5 +1,4 @@ import random -import warnings from typing import Any, Dict, NamedTuple, Optional import numpy as np From 6f1b48c1e367a48a72f93e2398ebf864de1ec4b2 Mon Sep 17 00:00:00 2001 From: Teven Date: Mon, 31 Aug 2020 16:08:45 +0200 Subject: [PATCH 22/26] Update src/transformers/trainer.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 36816bab6717..9865216731a5 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1425,7 +1425,7 @@ def prediction_step( def floating_point_ops(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]): """ - For models with a `floating_point_ops` method (e.g. models that inherit from `PretrainedModel`), uses + For models that inherit from :class:`~transformers.PretrainedModel`, uses that method to compute the number of floating point operations for every backward + forward pass. If using another model, either implement such a method in the model or override this method. 
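For intuition, here is a minimal back-of-the-envelope sketch (illustrative only, not part of the patch series) of the `6 * tokens * parameters` estimate that `Trainer.floating_point_ops` ultimately relies on; the batch size, sequence length, and parameter count below are assumed example figures:

    # Rough sanity check of the 6 * tokens * parameters approximation (assumed numbers).
    batch_size, sequence_length = 8, 512
    non_embedding_params = 110_000_000             # assumed model size, not from the patches
    tokens = batch_size * sequence_length          # 4096 tokens in one training batch
    # ~2 FLOs per parameter per token for the forward pass and roughly twice that for the
    # backward pass, hence the factor of 6 (section 2.1 of the paper cited in the docstring).
    flos_per_step = 6 * tokens * non_embedding_params
    print(f"{flos_per_step:.2e}")                  # ~2.70e+12 floating-point operations per step
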
From 304ebe8570ac7e7451944631489f651e021f16ce Mon Sep 17 00:00:00 2001 From: Teven Date: Mon, 31 Aug 2020 16:08:57 +0200 Subject: [PATCH 23/26] Update src/transformers/modeling_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d0f81cae5842..7e6ca304fd57 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -329,7 +329,7 @@ def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]): inputs (:obj:`dict`): The model inputs. Returns: - seed (:obj:`tuple`): The batch size and sequence length. + :obj:`Tuple[int, int]`: The batch size and sequence length. """ token_inputs = [tensor for key, tensor in input_dict.items() if "input" in key] if token_inputs: From 8ec3ea6a627096a429f6031e733eff2cbaf143ce Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 31 Aug 2020 16:33:19 +0200 Subject: [PATCH 24/26] Sylvain review --- src/transformers/modeling_utils.py | 22 ++++---- src/transformers/trainer.py | 83 ++++++++++-------------------- src/transformers/trainer_utils.py | 32 +++++++++++- 3 files changed, 70 insertions(+), 67 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 7e6ca304fd57..4669e5a89b31 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -296,7 +296,7 @@ def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility return head_mask - def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = False) -> int: + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: """ Get number of (optionally, trainable or non-embeddings) parameters in the module. @@ -304,7 +304,7 @@ def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = Fal only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to return only the number of trainable parameters - no_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): + exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to return only the number of non-embeddings parameters Returns: @@ -313,23 +313,21 @@ def num_parameters(self, only_trainable: bool = False, no_embeddings: bool = Fal def parameter_filter(x): return (x.requires_grad or not only_trainable) and not ( - isinstance(x, torch.nn.Embedding) and no_embeddings + isinstance(x, torch.nn.Embedding) and exclude_embeddings ) params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() return sum(p.numel() for p in params) - def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]): + def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]) -> int: """ - Helper function to estimate the batch size and sequence length from the model inputs. Returned batch size is the - first dimension of input tensors, returned sequence length is the sum of the second dimensions of all input - tensors. + Helper function to estimate the total number of tokens from the model inputs. Args: inputs (:obj:`dict`): The model inputs. Returns: - :obj:`Tuple[int, int]`: The batch size and sequence length. + :obj:`int`: The total number of tokens. 
""" token_inputs = [tensor for key, tensor in input_dict.items() if "input" in key] if token_inputs: @@ -340,7 +338,9 @@ def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]): ) return 0 - def floating_point_ops(self, input_dict: Dict[str, Union[torch.Tensor, Any]], no_embeddings: bool = True) -> int: + def floating_point_ops( + self, input_dict: Dict[str, Union[torch.Tensor, Any]], exclude_embeddings: bool = True + ) -> int: """ Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a batch with this transformer model. Default approximation neglects the quadratic dependency on the number of @@ -355,14 +355,14 @@ def floating_point_ops(self, input_dict: Dict[str, Union[torch.Tensor, Any]], no sequence_length (:obj:`int`): The number of tokens in each line of the batch. - no_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to count embedding and softmax operations. Returns: :obj:`int`: The number of floating-point operations. """ - return 6 * self.estimate_tokens(input_dict) * self.num_parameters(no_embeddings=no_embeddings) + return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 9865216731a5..740b59e7256a 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -41,6 +41,8 @@ TrainOutput, default_compute_objective, default_hp_space, + distributed_broadcast_scalars, + distributed_concat, set_seed, ) from .training_args import TrainingArguments @@ -700,12 +702,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D continue tr_loss += self.training_step(model, inputs) - - try: - self.total_flos += self.floating_point_ops(model, inputs) - except AttributeError: - # in case this is a DataParallel - self.total_flos += self.floating_point_ops(model.module, inputs) + self.total_flos += self.floating_point_ops(inputs) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps @@ -946,7 +943,7 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: logs["epoch"] = self.epoch if self.total_flos is not None: if self.args.local_rank != -1: - total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() + total_flos = distributed_broadcast_scalars([self.total_flos]).sum().item() else: total_flos = self.total_flos if total_flos > 0: @@ -1128,14 +1125,7 @@ def _save_tpu(self, output_dir: Optional[str] = None): raise ValueError("Trainer.model appears to not be a PreTrainedModel") xm.rendezvous("saving_checkpoint") - # Storing the number of floating-point operations that went into the model - if self.total_flos is not None: - if self.args.local_rank != -1: - total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() - else: - total_flos = self.total_flos - if total_flos > 0: - self.model.config.total_flos = total_flos + self._store_flos() self.model.save_pretrained(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -1148,14 +1138,7 @@ def _save(self, output_dir: Optional[str] = None): # They can then be reloaded using `from_pretrained()` if not isinstance(self.model, PreTrainedModel): raise 
ValueError("Trainer.model appears to not be a PreTrainedModel") - # Storing the number of floating-point operations that went into the model - if self.total_flos is not None: - if self.args.local_rank != -1: - total_flos = self.distributed_broadcast_scalars([self.total_flos]).sum().item() - else: - total_flos = self.total_flos - if total_flos > 0: - self.model.config.total_flos = total_flos + self._store_flos() self.model.save_pretrained(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -1166,6 +1149,16 @@ def _save(self, output_dir: Optional[str] = None): self.log_history, open(os.path.join(output_dir, "log_history.json"), "w"), indent=2, ensure_ascii=False ) + def _store_flos(self): + # Storing the number of floating-point operations that went into the model + if self.total_flos is not None: + if self.args.local_rank != -1: + total_flos = distributed_broadcast_scalars([self.total_flos]).sum().item() + else: + total_flos = self.total_flos + if total_flos > 0: + self.model.config.total_flos = total_flos + def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]: ordering_and_checkpoint_path = [] @@ -1313,9 +1306,9 @@ def prediction_loop( if self.args.local_rank != -1: # In distributed mode, concatenate all results from all nodes: if preds is not None: - preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader)) + preds = distributed_concat(preds, num_total_examples=self.num_examples(dataloader)) if label_ids is not None: - label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader)) + label_ids = distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader)) elif is_torch_tpu_available(): # tpu-comment: Get all predictions and labels from all worker shards of eval dataset if preds is not None: @@ -1336,7 +1329,7 @@ def prediction_loop( if len(eval_losses) > 0: if self.args.local_rank != -1: metrics["eval_loss"] = ( - self.distributed_broadcast_scalars(eval_losses, num_total_examples=self.num_examples(dataloader)) + distributed_broadcast_scalars(eval_losses, num_total_examples=self.num_examples(dataloader)) .mean() .item() ) @@ -1350,33 +1343,6 @@ def prediction_loop( return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) - def distributed_concat(self, tensor: torch.Tensor, num_total_examples: Optional[int] = None) -> torch.Tensor: - assert self.args.local_rank != -1 - - output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())] - torch.distributed.all_gather(output_tensors, tensor) - concat = torch.cat(output_tensors, dim=0) - - # truncate the dummy elements added by SequentialDistributedSampler - if num_total_examples is not None: - concat = concat[:num_total_examples] - return concat - - def distributed_broadcast_scalars( - self, scalars: List[Union[int, float]], num_total_examples: Optional[int] = None - ) -> torch.Tensor: - assert self.args.local_rank != -1 - - tensorized_scalar = torch.Tensor(scalars).cuda() - output_tensors = [tensorized_scalar.clone() for _ in range(torch.distributed.get_world_size())] - torch.distributed.all_gather(output_tensors, tensorized_scalar) - concat = torch.cat(output_tensors, dim=0) - - # truncate the dummy elements added by SequentialDistributedSampler - if num_total_examples is not None: - concat = concat[:num_total_examples] - return concat - def prediction_step( self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], 
prediction_loss_only: bool ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -1423,11 +1389,11 @@ def prediction_step( labels = labels.detach() return (loss, logits.detach(), labels) - def floating_point_ops(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]): + def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]): """ For models that inherit from :class:`~transformers.PretrainedModel`, uses that method to compute the number of floating point operations for every backward + forward pass. If using - another model, either implement such a method in the model or override this method. + another model, either implement such a method in the model or subclass and override this method. Args: model (:obj:`nn.Module`): @@ -1439,6 +1405,13 @@ def floating_point_ops(self, model: nn.Module, inputs: Dict[str, Union[torch.Ten :obj:`int`: The number of floating-point operations. """ + if isinstance(self.model, torch.nn.DataParallel) or isinstance( + self.model, torch.nn.parallel.DistributedDataParallel + ): + model = self.model.module + else: + model = self.model + if hasattr(model, "floating_point_ops"): return model.floating_point_ops(inputs) diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index d5556f16c3a7..615e59305f0e 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -1,7 +1,8 @@ import random -from typing import Any, Dict, NamedTuple, Optional +from typing import Any, Dict, List, NamedTuple, Optional, Union import numpy as np +import torch from .file_utils import is_tf_available, is_torch_available from .integrations import is_ray_available @@ -122,3 +123,32 @@ class HPSearchBackend(ExplicitEnum): HPSearchBackend.OPTUNA: default_hp_space_optuna, HPSearchBackend.RAY: default_hp_space_ray, } + + +def distributed_concat(self, tensor: torch.Tensor, num_total_examples: Optional[int] = None) -> torch.Tensor: + assert self.args.local_rank != -1 + + output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())] + torch.distributed.all_gather(output_tensors, tensor) + concat = torch.cat(output_tensors, dim=0) + + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat + + +def distributed_broadcast_scalars( + self, scalars: List[Union[int, float]], num_total_examples: Optional[int] = None +) -> torch.Tensor: + assert self.args.local_rank != -1 + + tensorized_scalar = torch.Tensor(scalars).cuda() + output_tensors = [tensorized_scalar.clone() for _ in range(torch.distributed.get_world_size())] + torch.distributed.all_gather(output_tensors, tensorized_scalar) + concat = torch.cat(output_tensors, dim=0) + + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat From 4becfacaa258dfbb7b602b056a5370442dff2d0d Mon Sep 17 00:00:00 2001 From: Teven Date: Mon, 31 Aug 2020 16:40:28 +0200 Subject: [PATCH 25/26] Update src/transformers/modeling_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 4669e5a89b31..9591f425098b 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -344,7 +344,7 @@ def 
floating_point_ops( """ Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a batch with this transformer model. Default approximation neglects the quadratic dependency on the number of - tokens (valid if 12 * d_model << sequence_length) as laid out in https://arxiv.org/pdf/2001.08361.pdf section + tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper `__ section 2.1. Should be overriden for transformers with parameter re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths. From eb9d328bbf6ebb37013b7df00e2f5b0e9342fd8b Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 31 Aug 2020 17:21:10 +0200 Subject: [PATCH 26/26] black --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 298a8f045e65..1f6ef4048d47 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -656,7 +656,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D try: self.global_step = int(model_path.split("-")[-1].split(os.path.sep)[0]) self.total_flos = getattr(model.config, "total_flos", 0) - + epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps