From 144f566111b412f4be852a6fa83ba2fdebeb0d4e Mon Sep 17 00:00:00 2001
From: Ajay Saini
Date: Tue, 25 Jan 2022 07:15:01 +0000
Subject: [PATCH 1/3] Logging plus doc changes

---
 .../algorithms/label_smoothing/label_smoothing.py |  2 +-
 composer/core/types.py                            |  8 ++++----
 composer/trainer/checkpoint.py                    | 14 +++++++++-----
 composer/trainer/trainer.py                       |  4 +++-
 composer/trainer/trainer_hparams.py               |  2 +-
 composer/yamls/models/classify_mnist.yaml         |  6 ++++++
 6 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/composer/algorithms/label_smoothing/label_smoothing.py b/composer/algorithms/label_smoothing/label_smoothing.py
index 26b5b01f2c..da89988e98 100644
--- a/composer/algorithms/label_smoothing/label_smoothing.py
+++ b/composer/algorithms/label_smoothing/label_smoothing.py
@@ -67,7 +67,7 @@ def smooth_labels(logits: Tensor, targets: Tensor, alpha: float):
     as in `Szegedy et al. `_.

     This is computed by ``(1 - alpha) * targets + alpha * smoothed_targets``
-    where ``smoothed_targets`` is a vector of ones.
+    where ``smoothed_targets`` is a uniform distribution.

     Args:
         logits: Output of the model. Tensor of shape (N, C, d1, ..., dn) for
diff --git a/composer/core/types.py b/composer/core/types.py
index 2646964c5f..c248afc7b7 100644
--- a/composer/core/types.py
+++ b/composer/core/types.py
@@ -46,7 +46,7 @@

 def as_batch_dict(batch: Batch) -> BatchDict:
     """Casts a :class:`Batch` as a :class:`BatchDict`.
-    
+
     Args:
         batch (Batch): A batch.
     Raises:
@@ -83,7 +83,7 @@

 class BreakEpochException(Exception):
     """Raising this exception will immediately end the current epoch.
-    
+
     If you're wondering whether you should use this, the answer is no.
     """

@@ -96,8 +96,8 @@ class DataLoader(Protocol):

     Attributes:
         dataset (Dataset): Dataset from which to load the data.
-        batch_size (int, optional): How many samples per batch to load
-            (default: ``1``).
+        batch_size (int, optional): How many samples per batch to load for a
+            single device (default: ``1``).
         num_workers (int): How many subprocesses to use for data loading. ``0`` means that the data will be
             loaded in the main process.
         pin_memory (bool): If ``True``, the data loader will copy Tensors
diff --git a/composer/trainer/checkpoint.py b/composer/trainer/checkpoint.py
index d8a7d0097b..6b51da8495 100755
--- a/composer/trainer/checkpoint.py
+++ b/composer/trainer/checkpoint.py
@@ -11,6 +11,7 @@
 import textwrap
 import urllib.parse
 import warnings
+from logging import INFO
 from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, cast

 import numpy as np
@@ -47,18 +48,18 @@ class CheckpointLoader:
         checkpoint (str): The template path to an existing checkpoint file. It can be a path to a file on
             local disk, a URL, or if ``object_store_hparams`` is set, the object name for a checkpoint
             in a cloud bucket.
-            
+
            When using Deepspeed zero, the :class:`CheckpointSaver` shards checkpoints by rank. To load deepspeed checkpoints,
            specify ``{RANK}`` in in the ``checkpoint`` parameter, and this variable will be substituted with the global rank.
            For example, suppose that checkpoints are stored in the following structure:
-            
+
            .. code-block::

                my_model/rank_0/ep1.tar
                my_model/rank_1/ep1.tar
                my_model/rank_2/ep1.tar
                ...
-            
+
            Then, ``checkpoint`` should be set to ``my_model/rank_{RANK}/ep1.tar``, and all ranks will load the
            correct data.
@@ -189,10 +190,10 @@ def _download_checkpoint(self, node_checkpoint_folder: str) -> Tuple[str, Option
             self._retrieve_checkpoint(destination_filepath=rank_zero_checkpoint_archive_filepath,
                                       rank=dist.get_global_rank(),
                                       ignore_not_found_errors=False)
-
         if extracted_checkpoint_folder is not None:
             try:
                 with tarfile.open(rank_zero_checkpoint_archive_filepath) as tarball:
+                    # with tarfile.open("ep10.tar") as tarball:
                     tarball.extractall(extracted_checkpoint_folder)
             except FileNotFoundError as e:
                 checkpoint_name = self.hparams.checkpoint.format(rank=dist.get_global_rank())
@@ -235,7 +236,7 @@ def _restore_checkpoint(self, state: State, mosaic_checkpoint_filepath: str,
         """
         # Now, all ranks load the checkpoint that local rank zero downloaded
         state_dict = torch.load(mosaic_checkpoint_filepath, map_location='cpu')
-        
+
         log.debug(f"Loaded checkpoint with keys {state_dict.keys()} and state with keys {state_dict['state'].keys()}")
         seed_to_restore = None
         if is_module_deepspeed(state.model):
@@ -287,6 +288,9 @@ def load_checkpoint(self, state: State):

         mosaic_checkpoint_filepath, extracted_checkpoint_folder = self._download_checkpoint(node_checkpoint_folder)
         seed_to_restore = self._restore_checkpoint(state, mosaic_checkpoint_filepath, extracted_checkpoint_folder)
+        log.info(f'{"Model weights" if self.hparams.load_weights_only else "Trainer checkpoint"}'
+                 f' loaded from {self.hparams.checkpoint}.')
+
         return seed_to_restore

     def restore_checkpoint_rng_state(self, device: Device):
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index db0fd0a29c..98be06aab2 100755
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -57,7 +57,8 @@ class Trainer:
             or dict of :class:`DataSpec` kwargs for the training data.
         eval_dataloader (DataLoader, DataSpec, or dict): The :class:`DataLoader`, :class:`DataSpec`,
             or dict of :class:`DataSpec` kwargs for the evaluation data.
-        max_epochs (int): The maxmimum number of epochs to train for.
+        max_duration (Union[str, `~composer.core.Time`]): The maximum amount of time to train for.
+            See `~composer.core.Time` for details.
         algorithms (List[Algorithm], optional): The algorithms to use during training.
             (default: ``[]``)
         optimizer_hparams: (OptimizerHparams, optional): The OptimizerHparams for constructing
@@ -356,6 +357,7 @@ def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
         hparams.validate()

         import composer
+        logging.basicConfig()
         logging.getLogger(composer.__name__).setLevel(hparams.log_level)

         # devices and systems
diff --git a/composer/trainer/trainer_hparams.py b/composer/trainer/trainer_hparams.py
index e7ac1288b7..c47eee8a68 100755
--- a/composer/trainer/trainer_hparams.py
+++ b/composer/trainer/trainer_hparams.py
@@ -196,7 +196,7 @@ class TrainerHparams(hp.Hparams):
                                               default=False)
     compute_training_metrics: bool = hp.optional(doc="Log validation metrics on training data",
                                                  default=False)
-    log_level: str = hp.optional(doc="Python loglevel to use composer", default="WARNING")
+    log_level: str = hp.optional(doc="Python loglevel to use composer", default="INFO")
     datadir: Optional[str] = hp.optional(doc=textwrap.dedent("""
        Datadir to apply for both the training and validation datasets.
        If specified, it will override train_dataset.datadir and val_dataset.datadir"""),
diff --git a/composer/yamls/models/classify_mnist.yaml b/composer/yamls/models/classify_mnist.yaml
index 7f5a8468ef..a81ad5f8fd 100644
--- a/composer/yamls/models/classify_mnist.yaml
+++ b/composer/yamls/models/classify_mnist.yaml
@@ -43,3 +43,9 @@ dataloader:
 validate_every_n_epochs: 1
 grad_accum: 1
 precision: amp
+load_checkpoint:
+  checkpoint: mosaic_states.pt
+  # checkpoint: "runs/2022-01-25T02:51:46.406392/rank_0/checkpoints/ep10.tar"
+# save_checkpoint:
+#   interval_unit: ep
+#   interval: 10

From c95a6c92b482e2768f819e111c9e9d746c0b7834 Mon Sep 17 00:00:00 2001
From: Ajay Saini
Date: Tue, 25 Jan 2022 07:19:47 +0000
Subject: [PATCH 2/3] Small fixes

---
 composer/trainer/checkpoint.py            | 2 --
 composer/yamls/models/classify_mnist.yaml | 6 ------
 2 files changed, 8 deletions(-)

diff --git a/composer/trainer/checkpoint.py b/composer/trainer/checkpoint.py
index 6b51da8495..2041b9574a 100755
--- a/composer/trainer/checkpoint.py
+++ b/composer/trainer/checkpoint.py
@@ -11,7 +11,6 @@
 import textwrap
 import urllib.parse
 import warnings
-from logging import INFO
 from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, cast

 import numpy as np
@@ -193,7 +192,6 @@ def _download_checkpoint(self, node_checkpoint_folder: str) -> Tuple[str, Option
         if extracted_checkpoint_folder is not None:
             try:
                 with tarfile.open(rank_zero_checkpoint_archive_filepath) as tarball:
-                    # with tarfile.open("ep10.tar") as tarball:
                     tarball.extractall(extracted_checkpoint_folder)
             except FileNotFoundError as e:
                 checkpoint_name = self.hparams.checkpoint.format(rank=dist.get_global_rank())
diff --git a/composer/yamls/models/classify_mnist.yaml b/composer/yamls/models/classify_mnist.yaml
index a81ad5f8fd..7f5a8468ef 100644
--- a/composer/yamls/models/classify_mnist.yaml
+++ b/composer/yamls/models/classify_mnist.yaml
@@ -43,9 +43,3 @@ dataloader:
 validate_every_n_epochs: 1
 grad_accum: 1
 precision: amp
-load_checkpoint:
-  checkpoint: mosaic_states.pt
-  # checkpoint: "runs/2022-01-25T02:51:46.406392/rank_0/checkpoints/ep10.tar"
-# save_checkpoint:
-#   interval_unit: ep
-#   interval: 10

From 2ade05582a2be79a1c7749b0539797ca3897e2f2 Mon Sep 17 00:00:00 2001
From: Ajay Saini
Date: Tue, 25 Jan 2022 07:24:19 +0000
Subject: [PATCH 3/3] Removed extra line

---
 composer/trainer/trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 98be06aab2..48e562e304 100755
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -357,7 +357,6 @@ def create_from_hparams(cls, hparams: TrainerHparams) -> Trainer:
         hparams.validate()

         import composer
-        logging.basicConfig()
         logging.getLogger(composer.__name__).setLevel(hparams.log_level)

         # devices and systems
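
A usage sketch of the ``{RANK}`` template described in the ``CheckpointLoader`` docstring above, assuming plain ``str.format`` substitution; the path and rank value are illustrative and not taken from the patch, and the placeholder casing follows the docstring (the hparams handling in composer may use a different keyword):

    # Illustrative sketch: resolve a {RANK}-templated checkpoint path.
    # In the patched code the rank comes from dist.get_global_rank();
    # a fixed value is assumed here for the example.
    checkpoint_template = "my_model/rank_{RANK}/ep1.tar"

    global_rank = 1  # assumed rank; each process substitutes its own global rank
    resolved_checkpoint = checkpoint_template.format(RANK=global_rank)
    print(resolved_checkpoint)  # -> my_model/rank_1/ep1.tar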