From 144f566111b412f4be852a6fa83ba2fdebeb0d4e Mon Sep 17 00:00:00 2001
From: Ajay Saini
Date: Tue, 25 Jan 2022 07:15:01 +0000
Subject: [PATCH 1/3] Logging plus doc changes

---
 .../algorithms/label_smoothing/label_smoothing.py |  2 +-
 composer/core/types.py                            |  8 ++++----
 composer/trainer/checkpoint.py                    | 14 +++++++++-----
 composer/trainer/trainer.py                       |  4 +++-
 composer/trainer/trainer_hparams.py               |  2 +-
 composer/yamls/models/classify_mnist.yaml         |  6 ++++++
 6 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/composer/algorithms/label_smoothing/label_smoothing.py b/composer/algorithms/label_smoothing/label_smoothing.py
index 26b5b01f2c..da89988e98 100644
--- a/composer/algorithms/label_smoothing/label_smoothing.py
+++ b/composer/algorithms/label_smoothing/label_smoothing.py
@@ -67,7 +67,7 @@ def smooth_labels(logits: Tensor, targets: Tensor, alpha: float):
     as in `Szegedy et al. `_.

     This is computed by ``(1 - alpha) * targets + alpha * smoothed_targets``
-    where ``smoothed_targets`` is a vector of ones.
+    where ``smoothed_targets`` is a uniform distribution.

     Args:
         logits: Output of the model. Tensor of shape (N, C, d1, ..., dn) for
diff --git a/composer/core/types.py b/composer/core/types.py
index 2646964c5f..c248afc7b7 100644
--- a/composer/core/types.py
+++ b/composer/core/types.py
@@ -46,7 +46,7 @@

 def as_batch_dict(batch: Batch) -> BatchDict:
     """Casts a :class:`Batch` as a :class:`BatchDict`.
-    
+
     Args:
         batch (Batch): A batch.
     Raises:
@@ -83,7 +83,7 @@

 class BreakEpochException(Exception):
     """Raising this exception will immediately end the current epoch.
-    
+
     If you're wondering whether you should use this, the answer is no.
     """

@@ -96,8 +96,8 @@ class DataLoader(Protocol):

     Attributes:
         dataset (Dataset): Dataset from which to load the data.
-        batch_size (int, optional): How many samples per batch to load
-            (default: ``1``).
+        batch_size (int, optional): How many samples per batch to load for a
+            single device (default: ``1``).
         num_workers (int): How many subprocesses to use for data loading. ``0`` means that the data will be
             loaded in the main process.
         pin_memory (bool): If ``True``, the data loader will copy Tensors
diff --git a/composer/trainer/checkpoint.py b/composer/trainer/checkpoint.py
index d8a7d0097b..6b51da8495 100755
--- a/composer/trainer/checkpoint.py
+++ b/composer/trainer/checkpoint.py
@@ -11,6 +11,7 @@
 import textwrap
 import urllib.parse
 import warnings
+from logging import INFO
 from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, cast

 import numpy as np
@@ -47,18 +48,18 @@ class CheckpointLoader:
         checkpoint (str): The template path to an existing checkpoint file. It can be a path to a file on
             local disk, a URL, or if ``object_store_hparams`` is set, the object name for a checkpoint
             in a cloud bucket.
-            
+
            When using Deepspeed zero, the :class:`CheckpointSaver` shards checkpoints by rank. To load deepspeed checkpoints,
            specify ``{RANK}`` in in the ``checkpoint`` parameter, and this variable will be substituted with the global rank.
            For example, suppose that checkpoints are stored in the following structure:
-            
+
            .. code-block::

                my_model/rank_0/ep1.tar
                my_model/rank_1/ep1.tar
                my_model/rank_2/ep1.tar
                ...
-            
+
            Then, ``checkpoint`` should be set to ``my_model/rank_{RANK}/ep1.tar``, and all ranks will load the
            correct data.
@@ -189,10 +190,10 @@ def _download_checkpoint(self, node_checkpoint_folder: str) -> Tuple[str, Option
             self._retrieve_checkpoint(destination_filepath=rank_zero_checkpoint_archive_filepath,
                                       rank=dist.get_global_rank(),
                                       ignore_not_found_errors=False)
-
         if extracted_checkpoint_folder is not None:
             try:
                 with tarfile.open(rank_zero_checkpoint_archive_filepath) as tarball:
+                    # with tarfile.open("ep10.tar") as tarball:
                     tarball.extractall(extracted_checkpoint_folder)
             except FileNotFoundError as e:
                 checkpoint_name = self.hparams.checkpoint.format(rank=dist.get_global_rank())
@@ -235,7 +236,7 @@ def _restore_checkpoint(self, state: State, mosaic_checkpoint_filepath: str,
         """
         # Now, all ranks load the checkpoint that local rank zero downloaded
         state_dict = torch.load(mosaic_checkpoint_filepath, map_location='cpu')
-        
+
         log.debug(f"Loaded checkpoint with keys {state_dict.keys()} and state with keys {state_dict['state'].keys()}")
         seed_to_restore = None
         if is_module_deepspeed(state.model):
@@ -287,6 +288,9 @@ def load_checkpoint(self, state: State):

         mosaic_checkpoint_filepath, extracted_checkpoint_folder = self._download_checkpoint(node_checkpoint_folder)
         seed_to_restore = self._restore_checkpoint(state, mosaic_checkpoint_filepath, extracted_checkpoint_folder)
+        log.info(f'{"Model weights" if self.hparams.load_weights_only else "Trainer checkpoint"}'
+                 f' loaded from {self.hparams.checkpoint}.')
+
         return seed_to_restore

     def restore_checkpoint_rng_state(self, device: Device):
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index db0fd0a29c..98be06aab2 100755
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -57,7 +57,8 @@ class Trainer:
             or dict of :class:`DataSpec` kwargs for the training data.
         eval_dataloader (DataLoader, DataSpec, or dict): The :class:`DataLoader`, :class:`DataSpec`,
             or dict of :class:`DataSpec` kwargs for the evaluation data.
-        max_epochs (int): The maxmimum number of epochs to train for.
+        max_duration (Union[str, `~composer.core.Time`]): The maximum amount of time to train for.
+            See `~composer.core.Time` for details.
         algorithms (List[Algorithm], optional): The algorithms to use during training.
             (default: ``[]``)
         optimizer_hparams: (OptimizerHparams, optional): The OptimizerHparams for constructing
@@ -356,6 +357,7 @@ def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
         hparams.validate()

         import composer
+        logging.basicConfig()
         logging.getLogger(composer.__name__).setLevel(hparams.log_level)

         # devices and systems
diff --git a/composer/trainer/trainer_hparams.py b/composer/trainer/trainer_hparams.py
index e7ac1288b7..c47eee8a68 100755
--- a/composer/trainer/trainer_hparams.py
+++ b/composer/trainer/trainer_hparams.py
@@ -196,7 +196,7 @@ class TrainerHparams(hp.Hparams):
                                               default=False)
     compute_training_metrics: bool = hp.optional(doc="Log validation metrics on training data",
                                                  default=False)
-    log_level: str = hp.optional(doc="Python loglevel to use composer", default="WARNING")
+    log_level: str = hp.optional(doc="Python loglevel to use composer", default="INFO")
     datadir: Optional[str] = hp.optional(doc=textwrap.dedent("""
        Datadir to apply for both the training and validation datasets.
        If specified, it will override train_dataset.datadir and val_dataset.datadir"""),
diff --git a/composer/yamls/models/classify_mnist.yaml b/composer/yamls/models/classify_mnist.yaml
index 7f5a8468ef..a81ad5f8fd 100644
--- a/composer/yamls/models/classify_mnist.yaml
+++ b/composer/yamls/models/classify_mnist.yaml
@@ -43,3 +43,9 @@ dataloader:
 validate_every_n_epochs: 1
 grad_accum: 1
 precision: amp
+load_checkpoint:
+  checkpoint: mosaic_states.pt
+  # checkpoint: "runs/2022-01-25T02:51:46.406392/rank_0/checkpoints/ep10.tar"
+# save_checkpoint:
+#   interval_unit: ep
+#   interval: 10

From c95a6c92b482e2768f819e111c9e9d746c0b7834 Mon Sep 17 00:00:00 2001
From: Ajay Saini
Date: Tue, 25 Jan 2022 07:19:47 +0000
Subject: [PATCH 2/3] Small fixes

---
 composer/trainer/checkpoint.py            | 2 --
 composer/yamls/models/classify_mnist.yaml | 6 ------
 2 files changed, 8 deletions(-)

diff --git a/composer/trainer/checkpoint.py b/composer/trainer/checkpoint.py
index 6b51da8495..2041b9574a 100755
--- a/composer/trainer/checkpoint.py
+++ b/composer/trainer/checkpoint.py
@@ -11,7 +11,6 @@
 import textwrap
 import urllib.parse
 import warnings
-from logging import INFO
 from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, cast

 import numpy as np
@@ -193,7 +192,6 @@ def _download_checkpoint(self, node_checkpoint_folder: str) -> Tuple[str, Option
         if extracted_checkpoint_folder is not None:
             try:
                 with tarfile.open(rank_zero_checkpoint_archive_filepath) as tarball:
-                    # with tarfile.open("ep10.tar") as tarball:
                     tarball.extractall(extracted_checkpoint_folder)
             except FileNotFoundError as e:
                 checkpoint_name = self.hparams.checkpoint.format(rank=dist.get_global_rank())
diff --git a/composer/yamls/models/classify_mnist.yaml b/composer/yamls/models/classify_mnist.yaml
index a81ad5f8fd..7f5a8468ef 100644
--- a/composer/yamls/models/classify_mnist.yaml
+++ b/composer/yamls/models/classify_mnist.yaml
@@ -43,9 +43,3 @@ dataloader:
 validate_every_n_epochs: 1
 grad_accum: 1
 precision: amp
-load_checkpoint:
-  checkpoint: mosaic_states.pt
-  # checkpoint: "runs/2022-01-25T02:51:46.406392/rank_0/checkpoints/ep10.tar"
-# save_checkpoint:
-#   interval_unit: ep
-#   interval: 10

From 2ade05582a2be79a1c7749b0539797ca3897e2f2 Mon Sep 17 00:00:00 2001
From: Ajay Saini
Date: Tue, 25 Jan 2022 07:24:19 +0000
Subject: [PATCH 3/3] Removed extra line

---
 composer/trainer/trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 98be06aab2..48e562e304 100755
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -357,7 +357,6 @@ def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
         hparams.validate()

         import composer
-        logging.basicConfig()
         logging.getLogger(composer.__name__).setLevel(hparams.log_level)

         # devices and systems
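
A usage sketch of the ``{RANK}`` template described in the ``CheckpointLoader`` docstring above, assuming plain ``str.format`` substitution; the path and rank value are illustrative and not taken from the patch, and the placeholder casing follows the docstring (the hparams handling in composer may use a different keyword):

    # Illustrative sketch: resolve a {RANK}-templated checkpoint path.
    # In the patched code the rank comes from dist.get_global_rank();
    # a fixed value is assumed here for the example.
    checkpoint_template = "my_model/rank_{RANK}/ep1.tar"

    global_rank = 1  # assumed rank; each process substitutes its own global rank
    resolved_checkpoint = checkpoint_template.format(RANK=global_rank)
    print(resolved_checkpoint)  # -> my_model/rank_1/ep1.tar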