diff --git a/.ci/test_lint_doctests.py b/.ci/test_lint_doctests.py index 11f425fdf9..e6dbfdd58d 100644 --- a/.ci/test_lint_doctests.py +++ b/.ci/test_lint_doctests.py @@ -58,7 +58,7 @@ def test_run_doctests(): @pytest.mark.timeout(30) def test_docker_build_matrix(): - """Test that the docker build matrix is up to date""" + """Test that the docker build matrix is up to date.""" docker_folder = pathlib.Path(os.path.dirname(__file__)) / '..' / 'docker' # Capture the existing readme and build matrix contents @@ -86,7 +86,7 @@ def test_docker_build_matrix(): @pytest.mark.parametrize("example", [1, 2]) def test_release_tests_reflect_readme(example: int): - """Test that example_1.py and example_2.py in release_tests reflect the README.md""" + """Test that example_1.py and example_2.py in release_tests reflect the README.md.""" with open(pathlib.Path(os.path.dirname(__file__)) / '..' / 'README.md', 'r') as f: readme_lines = f.readlines() example_code_lines = [] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2658217054..aa3b9d36e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,16 +36,17 @@ repos: pass_filenames: false args: [--warnings] additional_dependencies: ["pyright@1.1.247"] - # - repo: https://github.com/PyCQA/pydocstyle - # hooks: - # - id: pydocstyle - # name: pydocstyle - # entry: pydocstyle - # language: python - # types: [python] - # additional_dependencies: - # - "toml" - # rev: 6.1.1 + - repo: https://github.com/PyCQA/pydocstyle + hooks: + - id: pydocstyle + name: pydocstyle + entry: pydocstyle + language: python + types: [python] + exclude: '(?:tests|.ci|composer\/algorithms|composer\/datasets|composer\/models)\/.*' + additional_dependencies: + - "toml" + rev: 6.1.1 - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 hooks: @@ -72,13 +73,11 @@ repos: - id: check-yaml - id: debug-statements - id: destroyed-symlinks - # - id: double-quote-string-fixer # TODO(ravi): Enable this check later. Generates a large diff. + # - id: double-quote-string-fixer # TODO(ravi): Enable this check later. Generates a large diff. - id: end-of-file-fixer - id: fix-byte-order-marker - id: mixed-line-ending - id: trailing-whitespace - # - id: name-tests-test # TODO(ravi): Enable this check later. Generates a large diff. - # args: ['--django'] - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.1.13 hooks: diff --git a/composer/__init__.py b/composer/__init__.py index 9d2d4303ae..c13042bb25 100644 --- a/composer/__init__.py +++ b/composer/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Composer.""" + from composer import algorithms as algorithms from composer import callbacks as callbacks from composer import datasets as datasets diff --git a/composer/algorithms/augmix/augmix.py b/composer/algorithms/augmix/augmix.py index c71c572d42..041a01884a 100644 --- a/composer/algorithms/augmix/augmix.py +++ b/composer/algorithms/augmix/augmix.py @@ -32,13 +32,13 @@ def augmix_image(img: ImgT, width: int = 3, alpha: float = 1.0, augmentation_set: List = augmentation_sets["all"]) -> ImgT: - """Applies AugMix (`Hendrycks et al, 2020 `_) data - augmentation to a single image or batch of images. See - :class:`.AugMix` and the - :doc:`Method Card ` for details. This function only acts on a - single image (or batch) per call and is unlikely to be used in a training loop. 
Use - :class:`~composer.algorithms.augmix.augmix.AugmentAndMixTransform` to use AugMix as - part of a :class:`torchvision.datasets.VisionDataset`\\'s ``transform``. + r"""Applies the AugMix (`Hendrycks et al, 2020 `_) data augmentation. + + This function works on a single image or batch of images. See :class:`.AugMix` and + the :doc:`Method Card ` for details. This function only acts on a + single image (or batch) per call and is unlikely to be used in a training loop. + Use :class:`~composer.algorithms.augmix.augmix.AugmentAndMixTransform` to use AugMix as + part of a :class:`torchvision.datasets.VisionDataset`\'s ``transform``. Example: .. testcode:: @@ -166,7 +166,9 @@ def forward(self, img: PillowImage) -> PillowImage: class AugMix(Algorithm): - """AugMix (`Hendrycks et al, 2020 `_) creates ``width`` sequences of ``depth`` + r"""The AugMix data augmentation technique. + + AugMix (`Hendrycks et al, 2020 `_) creates ``width`` sequences of ``depth`` image augmentations, applies each sequence with random intensity, and returns a convex combination of the ``width`` augmented images and the original image. The coefficients for mixing the augmented images are drawn from a uniform ``Dirichlet(alpha, alpha, ...)`` distribution. The coefficient for mixing the combined augmented image and the @@ -224,7 +226,7 @@ class AugMix(Algorithm): ``"color"``, ``"contrast"``, ``"sharpness"``, and ``"brightness"``. The original implementations have an intensity sampling scheme that samples a value bounded by 0.118 at a minimum, and a maximum value of - :math:`intensity \\times 0.18 + .1`, which ranges from 0.28 (intensity = 1) + :math:`intensity \times 0.18 + .1`, which ranges from 0.28 (intensity = 1) to 1.9 (intensity 10). These augmentations have different effects depending on whether they are < 0 or > 0 (or < 1 or > 1). "all" uses implementations of "color", "contrast", diff --git a/composer/algorithms/blurpool/blurpool_layers.py b/composer/algorithms/blurpool/blurpool_layers.py index cfdd8d2c11..a1817ca7a8 100644 --- a/composer/algorithms/blurpool/blurpool_layers.py +++ b/composer/algorithms/blurpool/blurpool_layers.py @@ -31,7 +31,7 @@ def _padding_for_filt_2d_same(filt: torch.Tensor): def blur_2d(input: torch.Tensor, stride: _size_2_t = 1, filter: Optional[torch.Tensor] = None) -> torch.Tensor: - """Apply a spatial low-pass filter. + """Applies a spatial low-pass filter. Args: input (:class:`torch.Tensor`): a 4d tensor of shape NCHW diff --git a/composer/algorithms/cutout/cutout.py b/composer/algorithms/cutout/cutout.py index 988c7cb72a..f5058eead1 100644 --- a/composer/algorithms/cutout/cutout.py +++ b/composer/algorithms/cutout/cutout.py @@ -129,7 +129,7 @@ def match(self, event: Event, state: State) -> bool: return event == Event.AFTER_DATALOADER def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]: - """Apply cutout on input images.""" + """Applies cutout on input images.""" x = state.batch_get_item(self.input_key) assert isinstance(x, Tensor), "Multiple tensors not supported for Cutout." diff --git a/composer/algorithms/progressive_resizing/__init__.py b/composer/algorithms/progressive_resizing/__init__.py index fb9969bb53..54f90aea97 100644 --- a/composer/algorithms/progressive_resizing/__init__.py +++ b/composer/algorithms/progressive_resizing/__init__.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Apply Fastai's `progressive resizing `__ data augmentation to speed up training. 
Progressive resizing initially reduces input resolution to speed up early training. Throughout training, the diff --git a/composer/algorithms/progressive_resizing/progressive_resizing.py b/composer/algorithms/progressive_resizing/progressive_resizing.py index 49e7214307..71a45d57a2 100644 --- a/composer/algorithms/progressive_resizing/progressive_resizing.py +++ b/composer/algorithms/progressive_resizing/progressive_resizing.py @@ -112,7 +112,9 @@ def resize_batch(input: torch.Tensor, class ProgressiveResizing(Algorithm): - """Apply Fastai's `progressive resizing `__ data augmentation to speed up training. @@ -204,6 +206,7 @@ def match(self, event: Event, state: State) -> bool: Args: event (:class:`Event`): The current event. state (:class:`State`): The current state. + Returns: bool: True if this algorithm should run now """ @@ -271,8 +274,11 @@ def _make_crop(tensor: torch.Tensor, scale_factor: float) -> T_ResizeTransform: def _make_crop_pair(X: torch.Tensor, y: torch.Tensor, scale_factor: float) -> Tuple[T_ResizeTransform, T_ResizeTransform]: - """Makes a pair of random crops for an input image X and target tensor y such that the same region is selected from - both.""" + """Makes a pair of random crops. + + Crops input image X and target tensor y such that the same region is selected from + both. + """ # New height and width for X HcX = int(scale_factor * X.shape[2]) WcX = int(scale_factor * X.shape[3]) diff --git a/composer/algorithms/squeeze_excite/squeeze_excite.py b/composer/algorithms/squeeze_excite/squeeze_excite.py index 5c88f1989b..60647c3963 100644 --- a/composer/algorithms/squeeze_excite/squeeze_excite.py +++ b/composer/algorithms/squeeze_excite/squeeze_excite.py @@ -162,7 +162,7 @@ def match(self, event: Event, state: State) -> bool: return event == Event.INIT def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]: - """Apply the Squeeze-and-Excitation layer replacement. + """Applies the Squeeze-and-Excitation layer replacement. Args: event (Event): the current event diff --git a/composer/algorithms/stochastic_depth/stochastic_depth.py b/composer/algorithms/stochastic_depth/stochastic_depth.py index 29747511fc..7e4375445b 100644 --- a/composer/algorithms/stochastic_depth/stochastic_depth.py +++ b/composer/algorithms/stochastic_depth/stochastic_depth.py @@ -191,7 +191,6 @@ def __init__(self, @property def find_unused_parameters(self) -> bool: """DDP parameter to notify that parameters may not have gradients if it is dropped during the forward pass.""" - return (self.stochastic_method == "block") def match(self, event: Event, state: State) -> bool: diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py index 06b54b11c5..b2120705b0 100644 --- a/composer/algorithms/swa/swa.py +++ b/composer/algorithms/swa/swa.py @@ -26,7 +26,7 @@ def _assert_valid_duration(time: Time): class SWA(Algorithm): - """Apply Stochastic Weight Averaging (`Izmailov et al, 2018 `_) + """Applies Stochastic Weight Averaging (`Izmailov et al, 2018 `_) Stochastic Weight Averaging (SWA) averages model weights sampled at different times near the end of training. 
This leads to better diff --git a/composer/algorithms/utils/augmentation_primitives.py b/composer/algorithms/utils/augmentation_primitives.py index 80b5e345d2..226a5549d4 100644 --- a/composer/algorithms/utils/augmentation_primitives.py +++ b/composer/algorithms/utils/augmentation_primitives.py @@ -76,6 +76,7 @@ def _float_parameter(level: float, maxval: float): level (float): Level of the operation that will be between [0, 10]. maxval (float): Maximum value that the operation can have. This will be scaled to level/10. + Returns: float: The result from scaling ``maxval`` according to ``level``. """ @@ -88,8 +89,10 @@ def _sample_level(n: float): def _symmetric_sample(level: float): - """Helper function to sample from a distribution over the domain [0.1, 10] with median == 1 and uniform probability - of x | 0.1 ≤ x ≤ 1, and x | 1 ≤ x ≤ 10. + """Helper function to sample from a symmetric distribution. + + The distribution over the domain [0.1, 10] with median == 1 and uniform probability of x | 0.1 ≤ x ≤ 1, + and x | 1 ≤ x ≤ 10. Used for sampling transforms that can range from intensity 0 to infinity, and for which an intensity of 1 == no change. @@ -106,7 +109,8 @@ def autocontrast(pil_img: Image.Image, level: float = 0.0): .. seealso:: :func:`PIL.ImageOps.autocontrast`. Args: - pil_img (Image.Image): The image + pil_img (Image.Image): The image. + level (float): The intensity. """ del level # unused return ImageOps.autocontrast(pil_img) @@ -118,7 +122,8 @@ def equalize(pil_img: Image.Image, level: float): .. seealso:: :func:`PIL.ImageOps.equalize`. Args: - pil_img (Image.Image): The image + pil_img (Image.Image): The image. + level (float): The intensity. """ del level # unused return ImageOps.equalize(pil_img) diff --git a/composer/callbacks/callback_hparams_registry.py b/composer/callbacks/callback_hparams_registry.py index 83eee2eebe..77c3fe26e7 100644 --- a/composer/callbacks/callback_hparams_registry.py +++ b/composer/callbacks/callback_hparams_registry.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Hyperparameter registry for callbacks.""" + from typing import Dict, Type, Union import yahp as hp diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 807135a485..1047d98d99 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -27,11 +27,11 @@ def checkpoint_periodically(interval: Union[str, int, Time]) -> Callable[[State, Event], bool]: - """Helper function to create a checkpoint scheduler according to a specified interval. + r"""Helper function to create a checkpoint scheduler according to a specified interval. Args: interval (Union[str, int, :class:`.Time`]): The interval describing how often checkpoints should be - saved. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`\\s. + saved. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`\s. Otherwise, the unit must be either :attr:`.TimeUnit.EPOCH` or :attr:`.TimeUnit.BATCH`. Checkpoints will be saved every ``n`` batches or epochs (depending on the unit), @@ -85,7 +85,7 @@ def save_interval(state: State, event: Event): return save_interval -class CheckpointSaver(Callback): +class CheckpointSaver(Callback): # noqa: D101 __doc__ = f"""Callback to save checkpoints. .. note:: @@ -114,10 +114,6 @@ class CheckpointSaver(Callback): ... ) ... ]) - .. 
testcleanup:: - - trainer.engine.close() - Args: folder (str, optional): Format string for the folder where checkpoints will be saved. Default: ``'{{run_name}}/checkpoints'``. diff --git a/composer/callbacks/early_stopper.py b/composer/callbacks/early_stopper.py index abce6c82d1..c55b7eeff8 100644 --- a/composer/callbacks/early_stopper.py +++ b/composer/callbacks/early_stopper.py @@ -21,11 +21,9 @@ class EarlyStopper(Callback): - """This callback tracks a training or evaluation metric and halts training if the metric does not - improve within a given interval. - - Example + """Halt training if a metric does not improve within a given interval. + Example: .. doctest:: >>> from composer.callbacks.early_stopper import EarlyStopper diff --git a/composer/callbacks/grad_monitor.py b/composer/callbacks/grad_monitor.py index 5fbb06f9bb..d52b1a7d49 100644 --- a/composer/callbacks/grad_monitor.py +++ b/composer/callbacks/grad_monitor.py @@ -17,8 +17,7 @@ class GradMonitor(Callback): the model and hence may cause a reduction in throughput while training large models. In order to ensure the correctness of norm, this function should be called after gradient unscaling in cases where gradients are scaled. - Example - + Example: .. doctest:: >>> from composer.callbacks import GradMonitor @@ -32,10 +31,6 @@ class GradMonitor(Callback): ... callbacks=[GradMonitor()], ... ) - .. testcleanup:: - - trainer.engine.close() - The L2 norms are logged by the :class:`~composer.loggers.logger.Logger` to the following keys as described below. +-----------------------------------+-------------------------------------------------------------+ @@ -57,7 +52,6 @@ class GradMonitor(Callback): """ def __init__(self, log_layer_grad_norms: bool = False): - super().__init__() self.log_layer_grad_norms = log_layer_grad_norms def after_train_batch(self, state: State, logger: Logger): diff --git a/composer/callbacks/lr_monitor.py b/composer/callbacks/lr_monitor.py index df25b06a38..f8f2d88732 100644 --- a/composer/callbacks/lr_monitor.py +++ b/composer/callbacks/lr_monitor.py @@ -14,8 +14,7 @@ class LRMonitor(Callback): This callback iterates over all optimizers and their parameter groups to log learning rate under the ``lr-{OPTIMIZER_NAME}/group{GROUP_NUMBER}`` key. - Example - + Example: .. doctest:: >>> from composer.callbacks import LRMonitor @@ -29,10 +28,6 @@ class LRMonitor(Callback): ... callbacks=[LRMonitor()], ... ) - .. testcleanup:: - - trainer.engine.close() - The learning rate is logged by the :class:`~composer.loggers.logger.Logger` to the following key as described below. @@ -46,7 +41,7 @@ class LRMonitor(Callback): """ def __init__(self) -> None: - super().__init__() + pass def batch_end(self, state: State, logger: Logger): assert state.optimizers is not None, "optimizers must be defined" diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py index 6774def8dc..8a89f564cf 100644 --- a/composer/callbacks/memory_monitor.py +++ b/composer/callbacks/memory_monitor.py @@ -23,8 +23,7 @@ class MemoryMonitor(Callback): This callback calls the torch memory stats API for cuda (see :func:`torch.cuda.memory_stats`) on the :attr:`~composer.core.event.Event.AFTER_TRAIN_BATCH` and reports different memory statistics. - Example - + Example: .. doctest:: >>> from composer.callbacks import MemoryMonitor @@ -38,10 +37,6 @@ class MemoryMonitor(Callback): ... callbacks=[MemoryMonitor()], ... ) - .. 
testcleanup:: - - trainer.engine.close() - The memory statistics are logged by the :class:`~composer.loggers.logger.Logger` to the following keys as described below. diff --git a/composer/callbacks/mlperf.py b/composer/callbacks/mlperf.py index 1746909876..655ed5734e 100644 --- a/composer/callbacks/mlperf.py +++ b/composer/callbacks/mlperf.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Create compliant results file for MLPerf Training benchmark.""" + import json import logging import os @@ -44,14 +46,14 @@ def _local_rank_zero() -> bool: return dist.get_local_rank() == 0 -def require_mlperf_logging(): +def _require_mlperf_logging(): if not mlperf_available: raise ImportError("""Please install with `pip install 'mosaicml[mlperf]'` and also install the logging library from: https://github.com/mlcommons/logging""") class MLPerfCallback(Callback): - """Creates a compliant results file for MLPerf Training benchmark. + """Create compliant results file for MLPerf Training benchmark. A submission folder structure will be created with the ``root_folder`` as the base and the following directories:: @@ -73,18 +75,17 @@ class MLPerfCallback(Callback): Currently, only open division submissions are supported with this Callback. Example: + .. code-block:: python - .. code-block:: python - - from composer.callbacks import MLPerfCallback + from composer.callbacks import MLPerfCallback - callback = MLPerfCallback( - root_folder='/submission', - index=0, - metric_name='Accuracy', - metric_label='eval', - target='0.759', - ) + callback = MLPerfCallback( + root_folder='/submission', + index=0, + metric_name='Accuracy', + metric_label='eval', + target='0.759', + ) During training, the metric found in ``state.current_metrics[metric_label][metric_name]`` will be compared against the target criterion. @@ -138,7 +139,7 @@ def __init__( host_processors_per_node: Optional[int] = None, ) -> None: - require_mlperf_logging() + _require_mlperf_logging() if benchmark not in BENCHMARKS: raise ValueError(f"benchmark: {benchmark} must be one of {BENCHMARKS}") @@ -176,7 +177,6 @@ def __init__( self.success = False def init(self, state: State, logger: Logger) -> None: - # setup here requies access to rank, which is only available after # the trainer is initialized if _local_rank_zero(): @@ -242,8 +242,7 @@ def _get_accuracy(self, state: State) -> float: return float(metric) def _get_dataloader_stats(self, dataloader: Iterable): - """ returns tuple of (batch_size, num_samples)""" - + """Returns a tuple of ``(batch_size, num_samples)``.""" if isinstance(dataloader, DataLoader): return (dataloader.batch_size, len(dataloader.dataset)) # type: ignore try: diff --git a/composer/callbacks/speed_monitor.py b/composer/callbacks/speed_monitor.py index fb6a7f7772..2ef68dca2f 100644 --- a/composer/callbacks/speed_monitor.py +++ b/composer/callbacks/speed_monitor.py @@ -23,8 +23,7 @@ class SpeedMonitor(Callback): average throughput and wall clock train, validation, and total time is also logged on the :attr:`~composer.core.event.Event.EPOCH_END` event. - Example - + Example: .. doctest:: >>> from composer.callbacks import SpeedMonitor @@ -38,10 +37,6 @@ class SpeedMonitor(Callback): ... callbacks=[SpeedMonitor(window_size=100)], ... ) - .. testcleanup:: - - trainer.engine.close() - The training throughput is logged by the :class:`~composer.loggers.logger.Logger` to the following keys as described below. 
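Most hunks in this diff apply the same two pydocstyle fixes seen above: the summary becomes a single line that ends with a period (D400), and a blank line separates it from the body (D205), with ``Example:`` used as a proper section header instead of a bare ``Example`` heading. A stand-alone sketch of the target style, using a made-up function rather than one from Composer:

.. code-block:: python

    def scale_learning_rate(lr: float, factor: float) -> float:
        """Scale a learning rate by ``factor``.

        The one-line summary above ends with a period (pydocstyle D400) and is
        separated from this body by a blank line (D205), matching the pattern
        applied throughout this diff.

        Args:
            lr (float): The current learning rate.
            factor (float): The multiplicative scaling factor.

        Returns:
            float: The scaled learning rate.
        """
        return lr * factor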
@@ -90,13 +85,6 @@ def __init__(self, window_size: int = 100): self.loaded_state: Optional[Dict[str, Any]] = None def state_dict(self) -> Dict[str, Any]: - """Returns a dictionary representing the internal state of the SpeedMonitor object. - - The returned dictionary is pickle-able via :func:`torch.save`. - - Returns: - Dict[str, Any]: The state of the SpeedMonitor object - """ current_time = time.time() return { "train_examples_per_epoch": self.train_examples_per_epoch, @@ -109,12 +97,6 @@ def state_dict(self) -> Dict[str, Any]: } def load_state_dict(self, state: Dict[str, Any]) -> None: - """Restores the state of SpeedMonitor object. - - Args: - state (Dict[str, Any]): The state of the object, - as previously returned by :meth:`.state_dict` - """ self.loaded_state = state def _load_state(self) -> None: diff --git a/composer/callbacks/threshold_stopper.py b/composer/callbacks/threshold_stopper.py index 49803487d1..9c267094fe 100644 --- a/composer/callbacks/threshold_stopper.py +++ b/composer/callbacks/threshold_stopper.py @@ -13,11 +13,9 @@ class ThresholdStopper(Callback): - """This callback tracks a training or evaluation metric and halts training when the - metric value reaches a certain threshold. - - Example + """Halt training when a metric value reaches a certain threshold. + Example: .. doctest:: >>> from composer.callbacks.threshold_stopper import ThresholdStopper diff --git a/composer/cli/__init__.py b/composer/cli/__init__.py index cba5725129..0eb6811a8e 100644 --- a/composer/cli/__init__.py +++ b/composer/cli/__init__.py @@ -1,2 +1,13 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 + +"""Composer CLI.""" + +import sys + +from composer.cli.launcher import main + +__all__ = ["main"] + +if __name__ == "__main__": + sys.exit(main()) diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index e82a21ca7a..71944bb984 100644 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -1,5 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 + +"""The Composer CLI launcher for distributed training.""" + import datetime import logging import os @@ -21,7 +24,7 @@ log = logging.getLogger(__name__) -def get_parser(): +def _get_parser(): parser = ArgumentParser(description="Utility for launching distributed machine learning jobs.") required_args = parser.add_argument_group("required arguments") @@ -120,7 +123,7 @@ def _get_free_tcp_port() -> int: def _parse_args(): - parser = get_parser() + parser = _get_parser() args = parser.parse_args() @@ -399,6 +402,7 @@ def _aggregate_process_returncode(processes: Dict[int, subprocess.Popen]) -> int def main(): + """Entrypoint into the Composer CLI.""" args = _parse_args() logging.basicConfig() diff --git a/composer/core/algorithm.py b/composer/core/algorithm.py index ce086e6225..763c2c1da9 100644 --- a/composer/core/algorithm.py +++ b/composer/core/algorithm.py @@ -42,8 +42,7 @@ def __init__(self, *args, **kwargs): # Stub signature for PyRight @property def find_unused_parameters(self) -> bool: - """Return True to indicate that the effect of this algorithm may cause some model parameters to be unused. - Defaults to False. + """Indicates whether this algorithm may cause some model parameters to be unused. Defaults to False. For example, it is used to tell :class:`torch.nn.parallel.DistributedDataParallel` (DDP) that some parameters will be frozen during training and hence it should not expect gradients from them. 
All algorithms which do any @@ -66,11 +65,9 @@ def backwards_create_graph(self) -> bool: @abstractmethod def match(self, event: Event, state: State) -> bool: - """Determines whether this algorithm should run given the current :class:`~.event.Event` and - :class:`~.state.State`. + """Determines whether this algorithm should run given the current :class:`~.event.Event` and :class:`~.state.State`. Examples: - To only run on a specific event (e.g., on :attr:`~.Event.BEFORE_LOSS`), override match as shown below: >>> class MyAlgorithm: @@ -92,6 +89,7 @@ def match(self, event: Event, state: State) -> bool: Args: event (Event): The current event. state (State): The current state. + Returns: bool: True if this algorithm should run now. """ @@ -108,6 +106,7 @@ def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]: event (Event): The current event. state (State): The current state. logger (Logger): A logger to use for logging algorithm-specific metrics. + Returns: int or None: exit code that will be stored in :class:`~.engine.Trace` and made accessible for debugging. """ diff --git a/composer/core/callback.py b/composer/core/callback.py index 30052c3a74..66e0244016 100644 --- a/composer/core/callback.py +++ b/composer/core/callback.py @@ -50,10 +50,6 @@ class Callback(Serializable, abc.ABC): >>> _ = trainer.engine.run_event(Event.EPOCH_START) Epoch: 0 - .. testcleanup:: - - trainer.engine.close() - #. Override :meth:`run_event` if you want a single method to handle all events. If this method is overridden, then the individual methods corresponding to each event name (such as :meth:`epoch_start`) will no longer be automatically invoked. For example, if you override :meth:`run_event` then :meth:`epoch_start` will not be called @@ -82,10 +78,6 @@ class Callback(Serializable, abc.ABC): >>> # is triggered, like this: >>> _ = trainer.engine.run_event(Event.EPOCH_START) Epoch: 0 - - .. testcleanup:: - - trainer.engine.close() """ def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -94,7 +86,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: pass def run_event(self, event: Event, state: State, logger: Logger) -> None: - """This method is called by the engine on each event. + """Called by the engine on each event. Args: event (Event): The event. @@ -441,10 +433,9 @@ def close(self, state: State, logger: Logger) -> None: pass def post_close(self) -> None: - """This hook is called after :meth:`close` has been invoked for each callback. Very few callbacks should need to - implement :meth:`post_close`. + """Called after :meth:`close` has been invoked for each callback. - This callback can be used to back up any data that may have been written by other callbacks during - :meth:`close`. + Very few callbacks should need to implement :meth:`post_close`. This callback can be used to back up any data + that may have been written by other callbacks during :meth:`close`. """ pass diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index 6a649bec0f..cae177002a 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -49,8 +49,9 @@ def _split_mapping(m, num_microbatches: int): def _default_split_batch(batch: Any, num_microbatches: int) -> Sequence: - """Splits batch into `num_microbatches` chunks for gradient accumulation. Works with tensors, dictionaries of - tensors, (x, y) tuples, and lists where batch is the 2nd dimension. + """Splits batch into `num_microbatches` chunks for gradient accumulation. 
+ + Works with tensors, dictionaries of tensors, (x, y) tuples, and lists where batch is the 2nd dimension. Args: batch: output from the dataloader. @@ -110,10 +111,6 @@ class DataSpec: ... max_duration="1ep", ... ) - .. testcleanup:: - - trainer.engine.close() - Args: dataloader (Iterable): The dataloader, which can be any iterable that yields batches. @@ -216,7 +213,7 @@ def _default_get_num_tokens_in_batch(self, batch: Batch) -> int: def ensure_data_spec(dataloader: Union[DataSpec, Iterable, dict]) -> DataSpec: - """Ensures that the ``dataloader`` is a :class:`.DataSpec` + """Ensures that the ``dataloader`` is a :class:`.DataSpec`. Args: dataloader (DataSpec | Iterable | dict): A DataSpec, DataLoader, or Dict of DataSpec kwargs. diff --git a/composer/core/engine.py b/composer/core/engine.py index dc4909e30c..d96392e63d 100644 --- a/composer/core/engine.py +++ b/composer/core/engine.py @@ -200,6 +200,7 @@ def run_event( Args: event (Event | str): The current :class:`~.event.Event`. It can be the enum member values or a string with the event value. + Returns: traces (Traces): Ordered dictionary of trace for each algorithm. """ diff --git a/composer/core/evaluator.py b/composer/core/evaluator.py index bcad074c63..1879189947 100644 --- a/composer/core/evaluator.py +++ b/composer/core/evaluator.py @@ -80,11 +80,6 @@ class Evaluator: ... max_duration="1ep", ... ) - .. testcleanup:: - - trainer.engine.close() - - Args: label (str): Name of the Evaluator dataloader (DataSpec | Iterable | Dict[str, Any]): Iterable that yields batches, a :class:`.DataSpec` for evaluation, diff --git a/composer/core/event.py b/composer/core/event.py index a38aad5b0d..616f7e0cc1 100644 --- a/composer/core/event.py +++ b/composer/core/event.py @@ -1,15 +1,17 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Events represent specific points in the training loop where an :class:`~.core.Algorithm` and :class:`~.core.Callback` -can run.""" +"""Training Loop Events.""" from composer.utils.string_enum import StringEnum __all__ = ["Event"] class Event(StringEnum): - """Enum to represent events in the training loop. + """Enum to represent training loop events. + + Events mark specific points in the training loop where an :class:`~.core.Algorithm` and :class:`~.core.Callback` + can run. The following pseudocode shows where each event fires in the training loop: @@ -167,14 +169,20 @@ class Event(StringEnum): @property def is_before_event(self) -> bool: - """Whether the event is a 'before_*' event (e.g., :attr:`~Event.BEFORE_LOSS`) and has a corresponding 'after_*' - (.e.g., :attr:`~Event.AFTER_LOSS`).""" + """Whether the event is an "before" event. + + An "before" event (e.g., :attr:`~Event.BEFORE_LOSS`) has a corresponding "after" event + (.e.g., :attr:`~Event.AFTER_LOSS`). + """ return self in _BEFORE_EVENTS @property def is_after_event(self) -> bool: - """Whether the event is an 'after_*' event (e.g., :attr:`~Event.AFTER_LOSS`) and has a corresponding 'before_*' - (.e.g., :attr:`~Event.BEFORE_LOSS`).""" + """Whether the event is an "after" event. + + An "after" event (e.g., :attr:`~Event.AFTER_LOSS`) has a corresponding "before" event + (.e.g., :attr:`~Event.BEFORE_LOSS`). 
+ """ return self in _AFTER_EVENTS @property diff --git a/composer/core/precision.py b/composer/core/precision.py index aac96052e3..2ead921d16 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -45,7 +45,6 @@ def get_precision_context(precision: Union[str, Precision]) -> Generator[None, N Args: precision (str | Precision): Precision for the context """ - precision = Precision(precision) if precision == Precision.FP32: if torch.cuda.is_available(): diff --git a/composer/core/state.py b/composer/core/state.py index eb7f7bc35c..907c84e95c 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -453,7 +453,6 @@ def load_state_dict(self, state: Dict[str, Any], strict: bool = False): strict (bool): whether the keys in the ``state["model"]`` should perfectly match the keys in the ``self.model``. Defaults to False. """ - state = _ensure_backwards_compatible_checkpointing(state) for attribute_name, serialized_value in state.items(): diff --git a/composer/core/time.py b/composer/core/time.py index f790e5cfa0..99c4d58ac1 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -55,9 +55,8 @@ class TimeUnit(StringEnum): TValue = TypeVar("TValue", int, float) -class Time(Generic[TValue]): - """Time represents static durations of training time or points in the training process in terms of a - :class:`TimeUnit` enum (epochs, batches, samples, tokens, or duration). +class Time(Generic[TValue], Serializable): + """Time represents static durations of training time in terms of a :class:`TimeUnit` enum. See the :doc:`Time Guide ` for more details on tracking time during training. @@ -129,7 +128,9 @@ def __init__( @classmethod def from_epoch(cls, epoch: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.EPOCH`. Equivalent to ``Time(epoch, TimeUnit.EPOCH)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.EPOCH`. + + Equivalent to ``Time(epoch, TimeUnit.EPOCH)``. Args: epoch (int): Number of epochs. @@ -141,7 +142,9 @@ def from_epoch(cls, epoch: int) -> Time: @classmethod def from_batch(cls, batch: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.BATCH`. Equivalent to ``Time(batch, TimeUnit.BATCH)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.BATCH`. + + Equivalent to ``Time(batch, TimeUnit.BATCH)``. Args: batch (int): Number of batches. @@ -153,8 +156,9 @@ def from_batch(cls, batch: int) -> Time: @classmethod def from_sample(cls, sample: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.SAMPLE`. Equivalent to ``Time(sample, - TimeUnit.SAMPLE)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.SAMPLE`. + + Equivalent to ``Time(sample, TimeUnit.SAMPLE)``. Args: sample (int): Number of samples. @@ -166,7 +170,9 @@ def from_sample(cls, sample: int) -> Time: @classmethod def from_token(cls, token: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.TOKEN`. Equivalent to ``Time(sample, TimeUnit.TOKEN)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.TOKEN`. + + Equivalent to ``Time(sample, TimeUnit.TOKEN)``. Args: token (int): Number of tokens. @@ -178,8 +184,9 @@ def from_token(cls, token: int) -> Time: @classmethod def from_duration(cls, duration: float) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.DURATION`. Equivalent to ``Time(duration, - TimeUnit.DURATION)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.DURATION`. + + Equivalent to ``Time(duration, TimeUnit.DURATION)``. 
Args: duration (float): Duration of the training process. Should be on ``[0, 1)`` @@ -326,8 +333,9 @@ def __hash__(self): @classmethod def from_timestring(cls, timestring: str) -> Time: - """Parse a time string into a :class:`Time` instance. A time string is a numerical value followed by the value - of a :class:`TimeUnit` enum. For example: + """Parse a time string into a :class:`Time` instance. + + A time string is a numerical value followed by the value of a :class:`TimeUnit` enum. For example: >>> Time.from_timestring("5ep") # describes 5 epochs. Time(5, TimeUnit.EPOCH) @@ -356,8 +364,10 @@ def from_timestring(cls, timestring: str) -> Time: class Timestamp(Serializable): - """Timestamp represents a snapshot of the current training progress, in terms of epochs, batches, - samples, and tokens. Timestamps are not updated in-place. + """Timestamp represents a snapshot of the current training progress. + + The timestamp measures training progress in terms of epochs, batches, samples, tokens, and wall clock time. + Timestamps are not updated in-place. See the :doc:`Time Guide ` for more details on tracking time during training. @@ -452,6 +462,11 @@ def state_dict(self) -> Dict[str, Any]: } def get_state(self) -> Dict[str, Union[Time[int], datetime.timedelta]]: + """Returns all values of the timestamp object in a dictionary. + + Returns: + Dict[str, Union[Time[int], datetime.timedelta]]: All values of the timestamp object. + """ return { "epoch": self.epoch, "batch": self.batch, @@ -614,7 +629,7 @@ def to_next_batch( tokens: Union[int, Time] = 0, duration: Optional[datetime.timedelta] = None, ): - """Create a new :class:`.Timestamp`, with the batch, sample, and token counts properly incremented. + """Create a new :class:`.Timestamp`, advanced to the next batch. Equivalent to: @@ -670,12 +685,10 @@ def to_next_batch( ) def to_next_epoch(self): - """Create a new :class:`.Timestamp` incremented by one epoch and with - :attr:`batch_in_epoch`, :attr:`sample_in_epoch`, and :attr:`token_in_epoch` reset. + """Create a new :class:`.Timestamp`, advanced to the next epoch. Equivalent to: - .. testsetup:: from composer.core.time import Timestamp @@ -718,8 +731,9 @@ def copy( epoch_wct: Optional[datetime.timedelta] = None, batch_wct: Optional[datetime.timedelta] = None, ) -> Timestamp: - """Create a copy of the timestamp. Any specified values will override the existing values in the - returned copy. + """Create a copy of the timestamp. + + Any specified values will override the existing values in the returned copy. Args: epoch (int | Time[int], optional): The epoch. @@ -764,7 +778,7 @@ def __repr__(self) -> str: def ensure_time(maybe_time: Union[Time, str, int], int_unit: Union[TimeUnit, str]) -> Time: - """Ensure ``maybe_time`` is an instance of :class:`.Time` + """Ensure ``maybe_time`` is an instance of :class:`.Time`. Args: maybe_time (Time | str): A time string, integer, or instance of :class:`.Time`. diff --git a/composer/datasets/streaming/format.py b/composer/datasets/streaming/format.py index f7454f0129..4ff3f60707 100644 --- a/composer/datasets/streaming/format.py +++ b/composer/datasets/streaming/format.py @@ -1,8 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""The :class:`StreamingDatsetIndex` format that defines shard/sample metadata for :class:`StreamingDataset`. 
-""" +"""The :class:`StreamingDatsetIndex` format that defines shard/sample metadata for :class:`StreamingDataset`.""" import math from io import BufferedIOBase, BufferedReader, BufferedWriter, BytesIO @@ -279,7 +278,6 @@ def get_partition(self, world: World, batch_size: Optional[int] = None) -> Tuple min_id (int): The lowest sample ID of this partition. max_id (int): The highest sample ID of this partition. """ - global_device = world.global_device global_num_devices = world.global_num_devices node_worker = world.node_worker diff --git a/composer/datasets/streaming/world.py b/composer/datasets/streaming/world.py index 2912be5b16..4ffda61b6f 100644 --- a/composer/datasets/streaming/world.py +++ b/composer/datasets/streaming/world.py @@ -52,7 +52,6 @@ class World(NamedTuple): def get_world() -> World: """Returns a :class:`World` object, initialized using :mod:`composer.utils.dist` and :func:`torch.utils.data.get_worker_info`""" - # Node and Device info global_node = dist.get_node_rank() global_device = dist.get_global_rank() diff --git a/composer/loggers/__init__.py b/composer/loggers/__init__.py index 1c986593e9..0a931690ff 100644 --- a/composer/loggers/__init__.py +++ b/composer/loggers/__init__.py @@ -4,7 +4,7 @@ """Loggers to store metrics and artifacts. In Composer, algorithms and callbacks can make calls to the :class:`~.logger.Logger`, -which then routes the calls to the appropriate :class:`~.logger_destination.LoggerDestination`\\s. +which then routes the calls to the appropriate :class:`~.logger_destination.LoggerDestination` instances. The :class:`~.logger_destination.LoggerDestination` does the actual logging, for example to a file, or Weights and Biases. diff --git a/composer/loggers/file_logger.py b/composer/loggers/file_logger.py index 0ca63b1b5a..f90254997a 100644 --- a/composer/loggers/file_logger.py +++ b/composer/loggers/file_logger.py @@ -19,7 +19,7 @@ __all__ = ["FileLogger"] -class FileLogger(LoggerDestination): +class FileLogger(LoggerDestination): # noqa: D101 __doc__ = f"""Log data to a file. Example usage: diff --git a/composer/loggers/in_memory_logger.py b/composer/loggers/in_memory_logger.py index a743ddd162..f59fd0c2e4 100644 --- a/composer/loggers/in_memory_logger.py +++ b/composer/loggers/in_memory_logger.py @@ -23,8 +23,9 @@ class InMemoryLogger(LoggerDestination): - """Logs metrics to dictionary objects that persist in memory throughout training. Useful for collecting and plotting - data inside notebooks. + """Logs metrics to dictionary objects that persist in memory throughout training. + + Useful for collecting and plotting data inside notebooks. Example usage: .. testcode:: @@ -46,10 +47,6 @@ class InMemoryLogger(LoggerDestination): # which index in trainer.logger.destinations contains your desired logger. logged_data = trainer.logger.destinations[0].data - .. testcleanup:: - - trainer.engine.close() - Args: log_level (str | LogLevel, optional): :class:`~.logger.LogLevel` (i.e. unit of resolution) at @@ -120,7 +117,6 @@ def get_timeseries(self, metric: str) -> Dict[str, Any]: plt.xlabel("Batch") plt.ylabel("Validation Accuracy") """ - # Check that desired metric is in present data if metric not in self.data.keys(): raise ValueError(f"Invalid value for argument `metric`: {metric}. 
Requested " diff --git a/composer/loggers/logger.py b/composer/loggers/logger.py index 73e52501f3..0edabb46eb 100644 --- a/composer/loggers/logger.py +++ b/composer/loggers/logger.py @@ -31,6 +31,7 @@ class LogLevel(IntEnum): Logging destinations use the LogLevel to determine whether to record a given metric or state change. + Attributes: FIT: Logged once per training run. EPOCH: Logged once per epoch. @@ -152,7 +153,6 @@ def file_artifact( overwrite (bool, optional): Whether to overwrite an existing artifact with the same ``artifact_name``. (default: ``False``) """ - log_level = LogLevel(log_level) file_path = format_name_with_dist(format_str=str(file_path), run_name=self.run_name) file_path = pathlib.Path(file_path) @@ -185,7 +185,6 @@ def symlink_artifact( overwrite (bool, optional): Whether to overwrite an existing artifact with the same ``symlink_artifact_name``. (default: ``False``) """ - log_level = LogLevel(log_level) for destination in self.destinations: destination.log_symlink_artifact( @@ -197,15 +196,15 @@ def symlink_artifact( ) def data_fit(self, data: Dict[str, Any]) -> None: - """Helper function for ``self.data(LogLevel.FIT, data)``""" + """Helper function for ``self.data(LogLevel.FIT, data)``.""" self.data(LogLevel.FIT, data) def data_epoch(self, data: Dict[str, Any]) -> None: - """Helper function for ``self.data(LogLevel.EPOCH, data)``""" + """Helper function for ``self.data(LogLevel.EPOCH, data)``.""" self.data(LogLevel.EPOCH, data) def data_batch(self, data: Dict[str, Any]) -> None: - """Helper function for ``self.data(LogLevel.BATCH, data)``""" + """Helper function for ``self.data(LogLevel.BATCH, data)``.""" self.data(LogLevel.BATCH, data) @@ -214,6 +213,7 @@ def format_log_data_value(data: Any) -> str: Args: data: Data to format. + Returns: str: ``data`` as a string. """ diff --git a/composer/loggers/logger_destination.py b/composer/loggers/logger_destination.py index aa4ac732e9..3527088891 100644 --- a/composer/loggers/logger_destination.py +++ b/composer/loggers/logger_destination.py @@ -23,9 +23,7 @@ class LoggerDestination(Callback, ABC): :class:`~composer.core.event.Event`. For example, it may be helpful to run on :attr:`~composer.core.event.Event.EPOCH_END` to perform any flushing at the end of every epoch. - Example - ------- - + Example: .. doctest:: >>> from composer.loggers import LoggerDestination @@ -38,10 +36,6 @@ class LoggerDestination(Callback, ABC): ... loggers=[logger] ... ) Batch 0: {'rank_zero_seed': ...} - - .. testcleanup:: - - trainer.engine.close() """ def log_data(self, state: State, log_level: LogLevel, data: Dict[str, Any]): @@ -115,12 +109,15 @@ def log_symlink_artifact( symlink_artifact_name: str, overwrite: bool, ): - """Handle creating a symlink of a file artifact stored at ``existing_artifact_name`` to an artifact named + """Create a symlink. + + of a file artifact stored at ``existing_artifact_name`` to an artifact named ``symlink_artifact_name``. - Subclasses should implement this method to symlink logged files. However, not all loggers need to implement this - method. For example, the :class:`~composer.loggers.tqdm_logger.TQDMLogger` does not implement this method, as it - cannot handle file artifacts and thus does not need to do any special symlinking. + Subclasses should implement this method to create a symlink of a file artifact stored at + ``existing_artifact_name`` to an artifact named ``symlink_artifact_name``.. However, not all loggers need to + implement this method. 
For example, the :class:`~composer.loggers.tqdm_logger.TQDMLogger` does not implement + this method, as it cannot handle file artifacts and thus does not need to do any special symlinking. .. note:: diff --git a/composer/loggers/logger_hparams_registry.py b/composer/loggers/logger_hparams_registry.py index 9d60548bc0..ab5eb84e90 100644 --- a/composer/loggers/logger_hparams_registry.py +++ b/composer/loggers/logger_hparams_registry.py @@ -31,8 +31,7 @@ @dataclass class ObjectStoreLoggerHparams(hp.Hparams): - """:class:`~composer.loggers.in_memory_logger.InMemoryLogger` - hyperparameters. + """Hyperparameters for the :class:`~.ObjectStoreLogger`. Args: object_store_hparams (LibcloudObjectStoreHparams): The object store provider hparams. diff --git a/composer/loggers/object_store_logger.py b/composer/loggers/object_store_logger.py index 9d813a02cf..474e2ee842 100644 --- a/composer/loggers/object_store_logger.py +++ b/composer/loggers/object_store_logger.py @@ -43,7 +43,7 @@ def _always_log(state: State, log_level: LogLevel, artifact_name: str): class ObjectStoreLogger(LoggerDestination): - """Logger destination that uploads artifacts to an object store. + r"""Logger destination that uploads artifacts to an object store. This logger destination handles calls to :meth:`~composer.loggers.logger.Logger.file_artifact` and uploads files to an object store, such as AWS S3 or Google Cloud Storage. @@ -66,10 +66,6 @@ class ObjectStoreLogger(LoggerDestination): loggers=[object_store_logger], ) - .. testcleanup:: composer.loggers.object_store_logger.ObjectStoreLogger.__init__ - - trainer.engine.close() - .. note:: This callback blocks the training loop to copy each artifact where ``should_log_artifact`` returns ``True``, as @@ -83,7 +79,7 @@ class ObjectStoreLogger(LoggerDestination): always occurs in the background. * Provide a RAM disk path for the ``upload_staging_folder`` parameter. Copying files to stage on RAM will be - faster than writing to disk. However, there must have sufficient excess RAM, or :exc:`MemoryError`\\s may + faster than writing to disk. However, there must have sufficient excess RAM, or :exc:`MemoryError`\s may be raised. Args: @@ -301,8 +297,8 @@ def log_file_artifact(self, state: State, log_level: LogLevel, artifact_name: st def log_symlink_artifact(self, state: State, log_level: LogLevel, existing_artifact_name: str, symlink_artifact_name: str, overwrite: bool): - """Object stores do not natively support symlinks, so we emulate symlinks by adding a .symlink file to the - object store, which is a text file containing the name of the object it is pointing to.""" + # Object stores do not natively support symlinks, so we emulate symlinks by adding a .symlink file to the + # object store, which is a text file containing the name of the object it is pointing to. # Only symlink if we're logging artifact to begin with if not self.should_log_artifact(state, log_level, existing_artifact_name): return @@ -392,8 +388,7 @@ def _upload_worker( container: str, provider_kwargs: Optional[Dict[str, Any]], ): - """A long-running function to handle uploading files to the object store specified by (``provider``, ``container``, - ``provider_kwargs``). + """A long-running function to handle uploading files to the object store. The worker will continuously poll ``file_queue`` for files to upload. Once ``is_finished`` is set, the worker will exit once ``file_queue`` is empty. 
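The reworked ``_upload_worker`` summary above describes a poll-until-finished loop: the worker keeps draining ``file_queue`` and only exits once ``is_finished`` is set and the queue is empty. A minimal, self-contained sketch of that pattern (hypothetical names, not Composer's actual implementation):

.. code-block:: python

    import queue
    import threading

    def upload_worker(file_queue: queue.Queue, is_finished: threading.Event) -> None:
        # Keep polling; exit only once no more work will arrive AND the queue is drained.
        while True:
            try:
                object_name = file_queue.get(timeout=0.3)
            except queue.Empty:
                if is_finished.is_set():
                    break
                continue
            print(f"uploading {object_name}")  # placeholder for the real object-store upload
            file_queue.task_done()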
diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index 06252669e9..36f961e91f 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Log to Weights and Biases (https://wandb.ai/)""" +"""Log to `Weights and Biases `_.""" from __future__ import annotations @@ -25,7 +25,7 @@ class WandBLogger(LoggerDestination): - """Log to Weights and Biases (https://wandb.ai/) + """Log to `Weights and Biases `_. Args: project (str, optional): WandB project name. diff --git a/composer/loss/loss.py b/composer/loss/loss.py index 328e7b69b7..fadc461c06 100644 --- a/composer/loss/loss.py +++ b/composer/loss/loss.py @@ -27,9 +27,7 @@ def binary_cross_entropy_with_logits( pos_weight: Optional[Tensor] = None, scale_by_batch_size: Optional[bool] = True, ) -> torch.Tensor: - r"""Replacement for - :class:`~torch.nn.functional.binary_cross_entropy_with_logits` that can handle class - indices or one-hot labels. + r"""Replacement for :class:`~F.binary_cross_entropy_with_logits` that handles class indices or one-hot labels. :class:`~torch.nn.functional.binary_cross_entropy_with_logits` with ``reduction = 'mean'` will typically result in very small loss values (on the order of 1e-3), which @@ -83,8 +81,12 @@ def soft_cross_entropy(input: Tensor, ignore_index: int = -100, reduce: Optional[bool] = None, reduction: str = 'mean'): - r"""Drop-in replacement for :class:`~torch.nn.functional.cross_entropy` that can - handle class indices or one-hot labels. + r"""Drop-in replacement for :class:`~.F.cross_entropy` that handles class indices or one-hot labels. + + .. note:: + + This function will be obsolete with `this update `_. + Args: input (torch.Tensor) : :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)` in case of 2D Loss, or :math:`(N, C, d_1, d_2, ..., d_K)` where :math:`K \geq 1` @@ -116,7 +118,6 @@ def soft_cross_entropy(input: Tensor, elements in the output, ``'sum'``: the output will be summed. Note: ``size_average`` and ``reduce`` are in the process of being deprecated, and in the meantime, specifying either of those two args will override ``reduction``. Default: ``'mean'`` - This function will be obsolete with `this update `_. """ target_type = infer_target_type(input, target) diff --git a/composer/loss/utils.py b/composer/loss/utils.py index 2d8980b1aa..349cd0496f 100644 --- a/composer/loss/utils.py +++ b/composer/loss/utils.py @@ -34,12 +34,13 @@ def ensure_targets_one_hot(input: torch.Tensor, targets: torch.Tensor, num_classes: Optional[float] = None) -> torch.Tensor: r"""Ensures that the targets are in a one-hot format rather than an index format. + Args: input (torch.Tensor): :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)` in case of 2D Loss, or :math:`(N, C, d_1, d_2, ..., d_K)` where :math:`K \geq 1` in the case of K-dimensional loss. `input` is expected to contain unnormalized scores (often referred to as logits). - target (torch.Tensor) : If containing class indices, shape :math:`(N)` where each value is + targets (torch.Tensor) : If containing class indices, shape :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss. If containing class probabilities, same shape as the input. 
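The ``soft_cross_entropy`` and ``ensure_targets_one_hot`` docstrings above both hinge on the difference between index targets and one-hot (or soft) targets. A short plain-PyTorch sketch of that relationship, independent of Composer's helpers:

.. code-block:: python

    import torch
    import torch.nn.functional as F

    logits = torch.randn(4, 10)                 # (N, C) unnormalized scores
    index_targets = torch.randint(0, 10, (4,))  # class indices in [0, C-1]

    # Index targets converted to one-hot targets with the same shape as the logits.
    one_hot_targets = F.one_hot(index_targets, num_classes=10).float()

    # Cross-entropy over one-hot targets matches the index-based form.
    soft_ce = -(one_hot_targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()
    index_ce = F.cross_entropy(logits, index_targets)
    assert torch.allclose(soft_ce, index_ce)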
diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 236cd26c42..9ff1192bc5 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -84,7 +84,6 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: either the Tensor or a Mapping type that contains the loss or model logits. target (~torch.Tensor): A Tensor of ground-truth values to compare against. """ - assert isinstance(output, Tensor) output = output.view(-1, self.vocab_size) target = target.view(-1) @@ -178,7 +177,6 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: either the Tensor or a Mapping type that contains the loss or model logits. target (~torch.Tensor): A Tensor of ground-truth values to compare against. """ - # if logit modification algorithms aren't on, we take the loss directly from the model output if isinstance(output, Mapping) and 'loss' in output: loss = output['loss'] diff --git a/composer/models/efficientnetb0/_layers.py b/composer/models/efficientnetb0/_layers.py index 15d1e23643..ab12aec9c3 100644 --- a/composer/models/efficientnetb0/_layers.py +++ b/composer/models/efficientnetb0/_layers.py @@ -13,8 +13,9 @@ def round_channels( divisor: int = 8, min_value: Optional[int] = None, ) -> int: - """Round number of channels after scaling with width multiplier. This function ensures that channel integers halfway - inbetween divisors is rounded up. + """Round number of channels after scaling with width multiplier. + + This function ensures that channel integers halfway in-between divisors is rounded up. Args: channels (float): Number to round. @@ -23,7 +24,6 @@ def round_channels( min_value (int, optional): Minimum value the output can be. If not specified, defaults to the ``divisor``. """ - if not width_multiplier: return int(channels) channels *= width_multiplier @@ -37,7 +37,6 @@ def round_channels( def calculate_same_padding(kernel_size, dilation, stride): """Calculates the amount of padding to use to get the "SAME" functionality in Tensorflow.""" - return ((stride - 1) + dilation * (kernel_size - 1)) // 2 @@ -49,7 +48,6 @@ def drop_connect(inputs: torch.Tensor, drop_connect_rate: float, training: bool) drop_connect_rate (float): Probability of droppping each sample. training (bool): Whether or not the model is training """ - if not training: return inputs @@ -175,24 +173,26 @@ def forward(self, input: torch.Tensor): class MBConvBlock(nn.Module): - """Mobile Inverted Residual Bottleneck Block as defined in + """Mobile Inverted Residual Bottleneck Block. + + This block is implemented as as defined in `MobileNetV2: Inverted Residuals and Linear Bottlenecks `_ (Sandler et al, 2018). - Args: - in_channels (int): Number of channels in the input tensor. - out_channels (int): Number of channels in the output tensor. - kernel_size (int): Size of the convolving kernel. - stride (int): Stride of the convolution. - expand_ratio (int): How much to expand the input channels for the - depthwise convolution. - se_ratio (float): How much to scale `in_channels` for the hidden layer - dimensionality of the squeeze-excite module. - drop_connect_rate (float): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. - act_layer (torch.nn.Module): Activation layer to use in block. - norm_kwargs (dict): Normalization layer's keyword arguments. - norm_layer (torch.nn.Module): Normalization layer to use in block. + Args: + in_channels (int): Number of channels in the input tensor. 
+ out_channels (int): Number of channels in the output tensor. + kernel_size (int): Size of the convolving kernel. + stride (int): Stride of the convolution. + expand_ratio (int): How much to expand the input channels for the + depthwise convolution. + se_ratio (float): How much to scale `in_channels` for the hidden layer + dimensionality of the squeeze-excite module. + drop_connect_rate (float): Probability of dropping a sample before the + identity connection, provides regularization similar to stochastic + depth. + act_layer (torch.nn.Module): Activation layer to use in block. + norm_kwargs (dict): Normalization layer's keyword arguments. + norm_layer (torch.nn.Module): Normalization layer to use in block. """ def __init__(self, diff --git a/composer/models/initializers.py b/composer/models/initializers.py index a2f5c5a0f8..ca7b11d70f 100644 --- a/composer/models/initializers.py +++ b/composer/models/initializers.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Module Initializers.""" + from typing import Callable import torch @@ -19,6 +21,11 @@ class Initializer(StringEnum): LINEAR_LOG_CONSTANT_BIAS = "linear_log_constant_bias" def get_initializer(self) -> Callable[[torch.nn.Module], None]: + """Get the initializer function. + + Returns: + (torch.nn.Module) -> None: The initializer function. + """ def kaiming_normal(w: nn.Module): if isinstance(w, torch.nn.Linear) or isinstance(w, torch.nn.Conv2d): diff --git a/composer/models/model_hparams.py b/composer/models/model_hparams.py index 3c48e026fe..e878e6b375 100644 --- a/composer/models/model_hparams.py +++ b/composer/models/model_hparams.py @@ -35,8 +35,8 @@ class ModelHparams(hp.Hparams, ABC): @abstractmethod def initialize_object(self) -> ComposerModel: - """Invoked by the :meth:`~composer.trainer.trainer_hparams.TrainerHparams.initialize_object` to construct a - :class:`.ComposerModel`. + """ + Construct a :class:`.ComposerModel`. Returns: ComposerModel: The constructed :class:`.ComposerModel` diff --git a/composer/models/transformer_hparams.py b/composer/models/transformer_hparams.py index 4781e11e61..8d719411af 100644 --- a/composer/models/transformer_hparams.py +++ b/composer/models/transformer_hparams.py @@ -1,8 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""General `YAHP `_ interface for -ComposerTransformers.""" +"""YAHP :class:`.hp.Hparams` hyperparameters for ComposerTransformers.""" from abc import ABC from dataclasses import dataclass diff --git a/composer/models/transformer_shared.py b/composer/models/transformer_shared.py index 872143690b..3816f30c77 100644 --- a/composer/models/transformer_shared.py +++ b/composer/models/transformer_shared.py @@ -82,7 +82,7 @@ def loss(self, outputs: Mapping, batch: Batch) -> Union[Tensor, Sequence[Tensor] raise NotImplementedError("A model-specific loss function must be written.") def forward(self, batch: Batch) -> Mapping: - """Runs the forward pass of the model. + """Run the forward pass of the model. 
Args: batch (~composer.core.types.Batch): A dictionary of Dict[str, Tensor] of inputs that the diff --git a/composer/optim/decoupled_weight_decay.py b/composer/optim/decoupled_weight_decay.py index 505ad384fa..bb40101264 100644 --- a/composer/optim/decoupled_weight_decay.py +++ b/composer/optim/decoupled_weight_decay.py @@ -57,7 +57,6 @@ def sgdw(params: List[torch.Tensor], d_p_list: List[torch.Tensor], momentum_buff dampening (float): Dampening factor for momentum update nesterov (bool): Enables Nesterov momentum updates """ - for i, param in enumerate(params): d_p = d_p_list[i] @@ -188,7 +187,6 @@ def adamw(params: List[torch.Tensor], grads: List[torch.Tensor], exp_avgs: List[ weight_decay (float): Factor for decoupled weight decay eps (float): Term added to the denominator to improve numerical stability. """ - for i, param in enumerate(params): grad = grads[i] exp_avg = exp_avgs[i] diff --git a/composer/optim/optimizer_hparams_registry.py b/composer/optim/optimizer_hparams_registry.py index 13c2849281..fcda1091d2 100644 --- a/composer/optim/optimizer_hparams_registry.py +++ b/composer/optim/optimizer_hparams_registry.py @@ -45,8 +45,8 @@ def initialize_object( """Initializes the optimizer. Args: - param_group (Iterable[torch.Tensor] | Iterable[Dict[str, torch.Tensor]]): - Parameters for this optimizer to optimize. + param_group (Iterable[torch.Tensor] | Iterable[Dict[str, torch.Tensor]]): Parameters for + this optimizer to optimize. """ if self.optimizer_cls is None: raise ValueError(f"{type(self).__name__}.optimizer_cls must be defined") diff --git a/composer/optim/scheduler.py b/composer/optim/scheduler.py index 6b8fcf7b49..fad4cbf88d 100644 --- a/composer/optim/scheduler.py +++ b/composer/optim/scheduler.py @@ -164,7 +164,6 @@ def compile_composer_scheduler(scheduler: ComposerScheduler, state: State, ssr: Returns: compiled_scheduler (PyTorchScheduler): The scheduler, in a form compatible with PyTorch scheduler interfaces. """ - optimizers = state.optimizers if len(optimizers) != 1: raise NotImplementedError("Providing functional schedulers is unsupported with multiple optimizers.") @@ -380,7 +379,6 @@ def _cosine_anneal(x: float, min_y: float = 0.0, max_y: float = 1.0) -> float: Curve is cos(x) on domain [0, pi], stretched to the domain [0, 1] and range [min_y, max_y]. Additionally, param x is clipped to the interval [0, 1] """ - x = min(max(x, 0.0), 1.0) return min_y + (max_y - min_y) * (1 + math.cos(x * math.pi)) / 2 diff --git a/composer/profiler/json_trace_handler.py b/composer/profiler/json_trace_handler.py index d237c8c50b..a0426606c5 100644 --- a/composer/profiler/json_trace_handler.py +++ b/composer/profiler/json_trace_handler.py @@ -28,9 +28,11 @@ __all__ = ["JSONTraceHandler"] -class JSONTraceHandler(TraceHandler): - __doc__ = f"""Records trace events in `JSON trace format `_. +class JSONTraceHandler(TraceHandler): # noqa: D101 + __doc__ = f"""Records trace events in Chrome JSON trace format. + + See `this document `_ + for more information. Traces are output to ``output_directory``. Traces can be visualized using the Chrome Trace Viewer. To view in a Google Chrome browser, navigate to ``chrome://tracing`` and load the JSON trace file. 
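As a quick sanity check on the ``_cosine_anneal`` helper touched in the ``composer/optim/scheduler.py`` hunk above, here is a small standalone sketch that reproduces the formula shown there and verifies its endpoints. It is only an illustration of the documented curve, not new library code:

.. code-block:: python

    import math

    def cosine_anneal(x: float, min_y: float = 0.0, max_y: float = 1.0) -> float:
        # Clip x to [0, 1], then trace cos on [0, pi], rescaled to [min_y, max_y].
        x = min(max(x, 0.0), 1.0)
        return min_y + (max_y - min_y) * (1 + math.cos(x * math.pi)) / 2

    assert cosine_anneal(0.0) == 1.0              # the curve starts at max_y
    assert cosine_anneal(1.0) == 0.0              # and ends at min_y
    assert abs(cosine_anneal(0.5) - 0.5) < 1e-12  # the midpoint lands halfway

In other words, the schedule decays smoothly from ``max_y`` at ``x = 0`` to ``min_y`` at ``x = 1``.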
diff --git a/composer/profiler/json_trace_merger.py b/composer/profiler/json_trace_merger.py index 58274001fd..09f7e0b7cc 100644 --- a/composer/profiler/json_trace_merger.py +++ b/composer/profiler/json_trace_merger.py @@ -75,7 +75,6 @@ def merge_traces(output_file: Union[str, pathlib.Path], *trace_files: Union[str, output_file (str | pathlib.Path): The file to write the merged trace to trace_files (str | pathlib.Path): Variable number of trace files to merge together """ - ranks_to_clock_sync = _get_rank_to_clock_syncs(trace_files) rank_to_backwards_thread = {} rank_to_seen_threads = {rank: set() for rank in ranks_to_clock_sync.keys()} diff --git a/composer/profiler/marker.py b/composer/profiler/marker.py index 6cb05ef90f..aa4c3ce30a 100644 --- a/composer/profiler/marker.py +++ b/composer/profiler/marker.py @@ -118,7 +118,7 @@ def _record_counter_event(self, wall_clock_time_ns: int, timestamp: Timestamp, def start(self) -> None: """Record the start of a duration event. - To record the duration of an event, invoke :meth:`.Marker.start` followed by :meth:`.Marker.finish`\\: + To record the duration of an event, invoke :meth:`.Marker.start` followed by :meth:`.Marker.finish`. .. testsetup:: diff --git a/composer/profiler/profiler.py b/composer/profiler/profiler.py index 51879bf2fc..9b88a404c7 100644 --- a/composer/profiler/profiler.py +++ b/composer/profiler/profiler.py @@ -171,8 +171,10 @@ def trace_handlers(self, trace_handlers: Union[TraceHandler, Sequence[TraceHandl self._trace_handlers[:] = ensure_tuple(trace_handlers) def record_chrome_json_trace_file(self, filepath: Union[str, pathlib.Path]): - """Record trace events in `Chrome JSON format `_ in the trace handlers. + """Record trace events in Chrome JSON format in the trace handlers. + + See `this document `_ + for more information about Chrome JSON format. .. note:: diff --git a/composer/profiler/profiler_action.py b/composer/profiler/profiler_action.py index 5f3047edc9..f86d39c4d6 100644 --- a/composer/profiler/profiler_action.py +++ b/composer/profiler/profiler_action.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Action states for the :class:`Profiler` that define whether or not events are being recorded to the trace file.""" + from composer.utils.string_enum import StringEnum __all__ = ["ProfilerAction"] diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py index c2cd0d0b87..59c265bae4 100644 --- a/composer/profiler/torch_profiler.py +++ b/composer/profiler/torch_profiler.py @@ -25,7 +25,7 @@ __all__ = ["TorchProfiler"] -class TorchProfiler(Callback): +class TorchProfiler(Callback): # noqa: D101 __doc__ = f"""Profile the execution using the :class:`PyTorch Profiler `. Profiling results are stored in TensorBoard format in the directory specified by ``folder``. diff --git a/composer/profiler/trace_handler.py b/composer/profiler/trace_handler.py index 93ae1438d1..4413fa9d3a 100644 --- a/composer/profiler/trace_handler.py +++ b/composer/profiler/trace_handler.py @@ -90,8 +90,10 @@ def process_counter_event( pass def process_chrome_json_trace_file(self, filepath: pathlib.Path) -> None: - """Invoked when there are events in `Chrome JSON format `_ to record. + """Invoked when there are events in Chrome JSON format to record. + + See `this document `_ + for more information. Args: filepath (pathlib.Path): The filepath to a Chrome JSON trace file. 
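To make the trace workflow in the profiler hunks above concrete, here is a minimal usage sketch of ``merge_traces``. The signature (``output_file`` plus a variable number of trace files) is taken from the ``json_trace_merger.py`` docstring above; the file paths are hypothetical:

.. code-block:: python

    import pathlib

    from composer.profiler.json_trace_merger import merge_traces

    # Hypothetical per-rank traces written during a two-process run.
    rank_traces = [
        pathlib.Path("profiler_traces/rank0.trace.json"),
        pathlib.Path("profiler_traces/rank1.trace.json"),
    ]

    # Merge them into a single Chrome JSON trace file.
    merge_traces(pathlib.Path("profiler_traces/merged.trace.json"), *rank_traces)

The merged file can then be loaded in ``chrome://tracing``, as described in the ``JSONTraceHandler`` docstring.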
diff --git a/composer/trainer/_deepspeed.py b/composer/trainer/_deepspeed.py index 1ab00e3d73..7a7cd742e8 100644 --- a/composer/trainer/_deepspeed.py +++ b/composer/trainer/_deepspeed.py @@ -148,7 +148,6 @@ def _parse_deepspeed_config( to the trainer. RuntimeError: If the batch size of the train dataloader in the provided state is not set. """ - new_config = copy.deepcopy(config) _add_batch_config(new_config, state) _ensure_no_optim_in_config(new_config) @@ -178,7 +177,6 @@ def _fix_batch_precision_for_deepspeed(batch: Batch, precision: Precision) -> Ba Returns: Batch: The batch with it's precision adjusted to the specified precision. """ - if precision != Precision.FP16: return batch diff --git a/composer/trainer/_scaler.py b/composer/trainer/_scaler.py index 02aec05883..f08e5ce488 100644 --- a/composer/trainer/_scaler.py +++ b/composer/trainer/_scaler.py @@ -62,7 +62,6 @@ def step(self, optimizer: Optimizer, *args, **kwargs): Always called before the optimizer step. Checks if the optimizer can handle AMP closures (currently only Composer's SAM optimizer) If so, it passes an AMP-modified closure to the optimizer. """ - closure = kwargs["closure"] def _amp_closure(**kwargs): diff --git a/composer/trainer/devices/device.py b/composer/trainer/devices/device.py index 2b297ca097..204a7a11f7 100644 --- a/composer/trainer/devices/device.py +++ b/composer/trainer/devices/device.py @@ -56,9 +56,10 @@ def tensor_to_device(self, tensor: torch.Tensor) -> torch.Tensor: pass def batch_to_device(self, batch: T_Batch) -> T_Batch: - """Invoked by the :class:`.Trainer` move all tensors items in a batch to device. Supports nested sequences and - mappings of tensors. Ignores non-tensor items. Preserves sequence and types when possible, otherwise converts - sequences to lists. Converts mappings to dictionaries. + """Invoked by the :class:`.Trainer` move all tensors items in a batch to device. + + Supports nested sequences and mappings of tensors. Ignores non-tensor items. Preserves sequence and mapping types + when possible; otherwise, sequences are converted to lists, and mappings are converted to dictionaries. Args: batch (Any): The batch to move to the device. diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 825e3a4f92..bd57dc818f 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Train models!""" +"""Train models.""" from __future__ import annotations @@ -483,10 +483,6 @@ class Trainer: load_path=checkpoint_path, load_object_store=store, ) - - .. testcleanup:: - - trainer.engine.close() load_weights_only (bool, optional): Whether or not to only restore the weights from the checkpoint without restoring the associated state. Ignored if ``load_path`` is ``None``. (default: ``False``) load_strict_model_weights (bool, optional): Ensure that the set of weights in the checkpoint and model must exactly match. 
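The recursive semantics described in the ``Device.batch_to_device`` docstring earlier in this patch (move tensors, recurse into sequences and mappings, pass non-tensor items through) can be pictured with a short standalone sketch. This illustrates the documented behavior only and is not Composer's actual implementation:

.. code-block:: python

    from typing import Any, Callable

    import torch

    def move_batch(batch: Any, to_device: Callable[[torch.Tensor], torch.Tensor]) -> Any:
        """Recursively move every tensor in ``batch``; non-tensor items pass through."""
        if isinstance(batch, torch.Tensor):
            return to_device(batch)
        if isinstance(batch, dict):
            # Mappings are rebuilt as plain dicts in this sketch.
            return {k: move_batch(v, to_device) for k, v in batch.items()}
        if isinstance(batch, (list, tuple)):
            # Sequences are rebuilt as lists here; the real method preserves the
            # original sequence type when it can.
            return [move_batch(v, to_device) for v in batch]
        return batch

    batch = {"image": torch.zeros(2, 3), "ids": ["a", "b"]}
    moved = move_batch(batch, lambda t: t.to("cpu"))  # no-op device for the example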
@@ -920,24 +916,25 @@ def __init__( # Load Checkpoint self._rng_state = None # If autoresume is enabled, first check for existing checkpoints to load - latest_checkpoint_path = self._check_for_autoresume(autoresume=autoresume, - save_folder=save_folder, - save_overwrite=save_overwrite, - save_latest_filename=save_latest_filename, - run_name=run_name, - save_latest_artifact_name=save_latest_artifact_name, - loggers=loggers, - load_chunk_size=load_chunk_size, - load_progress_bar=load_progress_bar) - # Found latest checkpoint path, load that instead - if latest_checkpoint_path: - load_path = latest_checkpoint_path - # Disable object_store and load_weights_only since we're autoresuming locally - load_object_store = None - load_weights_only = False - log.info( - f"Found latest checkpoint at {latest_checkpoint_path}, loading instead of load_path {load_path} as autoresume enabled." - ) + if autoresume: + latest_checkpoint_path = self._determine_autoresume_load_path( + save_folder=save_folder, + save_overwrite=save_overwrite, + save_latest_filename=save_latest_filename, + run_name=run_name, + save_latest_artifact_name=save_latest_artifact_name, + loggers=loggers, + load_chunk_size=load_chunk_size, + load_progress_bar=load_progress_bar) + # Found latest checkpoint path, load that instead + if latest_checkpoint_path: + load_path = latest_checkpoint_path + # Disable object_store and load_weights_only since we're autoresuming locally + load_object_store = None + load_weights_only = False + log.info( + f"Found latest checkpoint at {latest_checkpoint_path}, loading instead of load_path {load_path} as autoresume enabled." + ) # Actually load the checkpoint from potentially updated arguments if load_path is not None: self._rng_state = load_checkpoint(state=self.state, @@ -997,50 +994,58 @@ def saved_checkpoints(self) -> List[Tuple[Timestamp, List[pathlib.Path]]]: return [] return self._checkpoint_saver.saved_checkpoints - def _check_for_autoresume(self, autoresume: bool, save_folder: Optional[str], save_overwrite: bool, - save_latest_filename: str, run_name: Optional[str], save_latest_artifact_name: str, - loggers: List[LoggerDestination], load_chunk_size: int, load_progress_bar: bool): - """If autoresume is enabled, check for latest checkpoint locally. If none is found, check loggers + def _determine_autoresume_load_path( + self, + save_folder: Optional[str], + save_overwrite: bool, + save_latest_filename: str, + run_name: Optional[str], + save_latest_artifact_name: str, + loggers: List[LoggerDestination], + load_chunk_size: int, + load_progress_bar: bool, + ): + """Determines the load path when using autoresume. + + First, check for latest checkpoint locally. If none is found, check loggers for checkpoint. If any checkpoint is found, load that instead of load_path. If none are found, use the user specified load_path. """ - if autoresume: - if save_folder is None: - raise ValueError("save_folder must be specified when autoresume is enabled.") - if save_overwrite: - raise ValueError( - "save_overwrite must be False when autoresume is enabled as autoresume always loads the latest existing checkpoint in save_folder." 
- ) - if save_latest_filename is None: - raise ValueError( - "save_latest_filename must be specified so autoresume knows where to load checkpoints from.") - if run_name is None: - raise ValueError("run_name must be specified so autoresume knows which run to load from.") - latest_filename_formatted = format_name_with_dist(save_latest_filename, run_name) - latest_checkpoint_path = os.path.join(save_folder, latest_filename_formatted) - # If latest checkpoint is not saved locally, try to fetch from loggers - if not os.path.exists(latest_checkpoint_path) and save_latest_artifact_name is not None: - # Make save folder in case it doesn't exist so latest checkpoint can be downloaded - os.makedirs(save_folder, exist_ok=True) - for logger in loggers: - try: - # Fetch from logger. If it succeeds, stop trying the rest of the loggers - logger.get_file_artifact(artifact_name=format_name_with_dist( - save_latest_artifact_name, run_name), - destination=latest_checkpoint_path, - chunk_size=load_chunk_size, - progress_bar=load_progress_bar) - break - except (NotImplementedError, GetFileNotFoundException): - # Ignore errors caused by no checkpoint saved with logger - pass - # Require all ranks to have local checkpoint if we wish to restore from it - latest_checkpoint_exists = self._device.tensor_to_device( - torch.tensor([os.path.exists(latest_checkpoint_path)], dtype=torch.uint8)) - dist.all_reduce(latest_checkpoint_exists, reduce_operation="MIN") - # If latest checkpoint is saved locally, change load_path to it - if int(latest_checkpoint_exists.item()) == 1: - return latest_checkpoint_path + if save_folder is None: + raise ValueError("save_folder must be specified when autoresume is enabled.") + if save_overwrite: + raise ValueError( + "save_overwrite must be False when autoresume is enabled as autoresume always loads the latest existing checkpoint in save_folder." + ) + if save_latest_filename is None: + raise ValueError( + "save_latest_filename must be specified so autoresume knows where to load checkpoints from.") + if run_name is None: + raise ValueError("run_name must be specified so autoresume knows which run to load from.") + latest_filename_formatted = format_name_with_dist(save_latest_filename, run_name) + latest_checkpoint_path = os.path.join(save_folder, latest_filename_formatted) + # If latest checkpoint is not saved locally, try to fetch from loggers + if not os.path.exists(latest_checkpoint_path) and save_latest_artifact_name is not None: + # Make save folder in case it doesn't exist so latest checkpoint can be downloaded + os.makedirs(save_folder, exist_ok=True) + for logger in loggers: + try: + # Fetch from logger. 
If it succeeds, stop trying the rest of the loggers + logger.get_file_artifact(artifact_name=format_name_with_dist(save_latest_artifact_name, run_name), + destination=latest_checkpoint_path, + chunk_size=load_chunk_size, + progress_bar=load_progress_bar) + break + except (NotImplementedError, GetFileNotFoundException): + # Ignore errors caused by no checkpoint saved with logger + pass + # Require all ranks to have local checkpoint if we wish to restore from it + latest_checkpoint_exists = self._device.tensor_to_device( + torch.tensor([os.path.exists(latest_checkpoint_path)], dtype=torch.uint8)) + dist.all_reduce(latest_checkpoint_exists, reduce_operation="MIN") + # If latest checkpoint is saved locally, change load_path to it + if int(latest_checkpoint_exists.item()) == 1: + return latest_checkpoint_path def fit( self, @@ -1173,8 +1178,7 @@ def fit( If ``reset_time`` is True, then :attr:`.State.max_duration` will be set to this parameter. optimizers (torch.optim.Optimizer | Sequence[torch.optim.Optimizer], optional): See :class:`.Trainer`. - schedulers (PyTorchScheduler | ComposerScheduler | Sequence[PyTorchScheduler | ComposerScheduler], optional): - See :class:`.Trainer`. + schedulers (PyTorchScheduler | ComposerScheduler | Sequence[PyTorchScheduler | ComposerScheduler], optional): See :class:`.Trainer`. scale_schedule_ratio (float, optional): See :class:`.Trainer`. step_schedulers_every_batch (bool, optional): See :class:`.Trainer`. eval_dataloader (Iterable | DataSpec | Evaluator | Sequence[Evaluator], optional): See :class:`.Trainer`. @@ -1552,7 +1556,6 @@ def _train_loop(self) -> None: def _run_evaluators(self, event: Event, log_level: LogLevel): """Runs evaluators periodically during training.""" - for evaluator in self.state.evaluators: assert evaluator.eval_interval is not None, "eval_interval should have been set on __init__() or fit()" assert evaluator.subset_num_batches is not None, "subset_num_batches should have been set on __init__() or fit()" @@ -1566,8 +1569,9 @@ def _run_evaluators(self, event: Event, log_level: LogLevel): ) def _train_batch(self, use_grad_scaling: bool): - """Compute loss by training on a full batch of data. Adaptively change microbatch size if enabled to maximize - GPU usage. + """Compute loss by training on a full batch of data. + + Adaptively change microbatch size if enabled to maximize GPU usage. Args: use_grad_scaling (bool): Enables gradient scaling @@ -1699,6 +1703,7 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int, tot Args: use_grad_scaling (bool): Whether to use gradient scaling. + current_batch_size (int): The current batch size. minibatch_num_samples (int): Number of samples in the minibatch. total_loss (torch.Tensor): Total loss aggregated across all microbatches. is_final_microbatch (bool): If current microbatch is the last one. @@ -1775,7 +1780,6 @@ def predict(self, dataloader: Union[DataLoader, DataSpec], subset_num_batches: i on this many batches. This parameter has no effect if it is greater than ``len(dataloader)``. If ``-1``, then the entire loader will be iterated over. 
(default: ``-1``) """ - if isinstance(dataloader, DataSpec): data_spec = dataloader else: diff --git a/composer/trainer/trainer_hparams.py b/composer/trainer/trainer_hparams.py index 68110a6fb0..8655e9a4a1 100644 --- a/composer/trainer/trainer_hparams.py +++ b/composer/trainer/trainer_hparams.py @@ -879,8 +879,7 @@ class ExperimentHparams(hp.Hparams): evals: List[EvalHparams] = hp.optional("Eval hparams", default_factory=list) def initialize_object(self) -> Tuple[Trainer, List[FitKwargs], List[EvalKwargs]]: - """Construct the :class:`.Trainer`, :meth:`~Trainer.fit` kwargs, and - :meth:`~Trainer.eval` kwargs. + """Construct the :class:`.Trainer`, :meth:`~Trainer.fit` kwargs, and :meth:`~Trainer.eval` kwargs. Returns: Tuple[Trainer, List[FitKwargs], List[EvalKwargs]]: A tuple of the diff --git a/composer/utils/batch_helpers.py b/composer/utils/batch_helpers.py index 9c1afd5d1c..c3e066c94c 100644 --- a/composer/utils/batch_helpers.py +++ b/composer/utils/batch_helpers.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Helpers to get items and set items in a batch.""" + from operator import attrgetter, itemgetter from typing import Any, Callable, Sequence, Union @@ -170,7 +172,7 @@ def _batch_set_multiple(batch: Any, key: Any, value: Any) -> Any: def _batch_set_tuple(batch: Any, key: Union[int, str], value: Any) -> Any: - """"Sets key value pairs in tuples and NamedTuples.""" + """Sets key value pairs in tuples and NamedTuples.""" if hasattr(batch, '_fields'): # NamedTuple if isinstance(key, str): batch = batch._replace(**{key: value}) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index 0e4e7d1342..9a4bcf01a5 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -293,11 +293,13 @@ def _restore_checkpoint( return state_dict['rng'] -def save_checkpoint(state: State, - logger: Logger, - filename: str = "ep{epoch}-ba{batch}-rank{rank}", - *, - weights_only: bool = False) -> List[pathlib.Path]: +def save_checkpoint( + state: State, + logger: Logger, + filename: str = "ep{epoch}-ba{batch}-rank{rank}", + *, + weights_only: bool = False, +) -> List[pathlib.Path]: # noqa: D103 state_dict = { 'state': state.state_dict(), 'rng': reproducibility.get_rng_state(), diff --git a/composer/utils/collect_env.py b/composer/utils/collect_env.py index 921651197b..41cfd4fdb3 100644 --- a/composer/utils/collect_env.py +++ b/composer/utils/collect_env.py @@ -111,7 +111,7 @@ def get_host_processor_name() -> str: def get_host_processor_cores() -> int: - """Determine the number of physical host processor cores.""" + """Determines the number of physical host processor cores.""" return psutil.cpu_count(logical=False) @@ -126,7 +126,7 @@ def get_accel_model_name() -> str: def get_local_world_size() -> int: - """Determine the number of accelerators per node.""" + """Determines the number of accelerators per node.""" return dist.get_local_world_size() if cuda_available() else 0 @@ -140,7 +140,7 @@ def get_cuda_device_count() -> int: def _exc_report(exc_type) -> None: - """Produces exception report (exception message + environment report) + """Produces exception report (exception message + environment report). Args: exc_type (Exception): Type of exception. 
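The ``_batch_set_tuple`` helper fixed in ``composer/utils/batch_helpers.py`` above relies on ``NamedTuple._replace`` for its named-tuple branch. A tiny standalone example of that pattern (the ``Sample`` type here is made up for illustration):

.. code-block:: python

    from typing import NamedTuple

    class Sample(NamedTuple):
        x: int
        y: int

    s = Sample(x=1, y=2)
    # NamedTuples are immutable, so "setting" a field returns a new instance.
    s = s._replace(**{"y": 5})
    assert s == Sample(x=1, y=5)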
@@ -230,7 +230,6 @@ def configure_excepthook() -> None: >>> sys.excepthook """ - global _EXCEPTHOOK_REGISTERED # Needs to be indempotent across multiple trainers, don't register if we've already registered if not _EXCEPTHOOK_REGISTERED: @@ -268,7 +267,6 @@ def get_torch_env() -> str: # Get Composer environment info def get_composer_env() -> str: """Query Composer pertinent system information.""" - mutable_dict = ComposerEnv( composer_version=get_composer_version(), host_processor_model_name=get_host_processor_name(), @@ -287,7 +285,6 @@ def print_env(file: Optional[TextIO] = None) -> None: """Generate system information report. Example: - .. code-block:: python from composer.utils.collect_env import print_env @@ -363,7 +360,6 @@ def print_env(file: Optional[TextIO] = None) -> None: Args: file (TextIO, optional): File handle, `sys.stdout` or `sys.stderr`. Defaults to `sys.stdout`. """ - # Set stdout during runtime if no output file is specified if file is None: file = sys.stdout diff --git a/composer/utils/dist.py b/composer/utils/dist.py index a128061e59..4088658e15 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -130,7 +130,9 @@ def get_local_rank() -> int: def get_node_rank() -> int: - """Returns the node rank. For example, if there are 2 nodes, and 2 ranks per node, then global ranks 0-1 will have a + """Returns the node rank. + + For example, if there are 2 nodes, and 2 ranks per node, then global ranks 0-1 will have a node rank of 0, and global ranks 2-3 will have a node rank of 1. Returns: @@ -239,6 +241,7 @@ def broadcast_object_list(object_list: List[Any], src: int = 0) -> None: Each object must be picklable. Only objects on the ``src`` rank will be broadcast, but each rank must provide lists of equal sizes. src (int, optional): Source rank (default: ``0``) + Returns: None: ``object_list`` will be modified in-place and set to values of ``object_list`` from the ``src`` rank. """ @@ -257,8 +260,7 @@ def broadcast_object_list(object_list: List[Any], src: int = 0) -> None: def all_gather(tensor: torch.Tensor) -> Sequence[torch.Tensor]: - """Collects a :class:`~torch.Tensor` from each rank and return a sequence of - :class:`~torch.Tensor`\\s indexed by rank. + """Collects a :class:`~torch.Tensor` from each rank. .. seealso:: :func:`torch.distributed.all_gather` @@ -350,7 +352,7 @@ def initialize_dist(backend: str, timeout: datetime.timedelta): Args: backend (str): The distributed backend to use. Should be ``gloo`` for CPU training, or ``nccl`` for GPU training. - timeout (datetime.timedelta): The timeout for operations exected against the process group. + timeout (datetime.timedelta): The timeout for operations executed against the process group. """ if get_world_size() > 1 and not dist.is_available(): raise RuntimeError("When the world size is > 1, ``torch.distributed`` must be used. However, it is " @@ -392,13 +394,15 @@ def initialize_dist(backend: str, timeout: datetime.timedelta): def get_sampler(dataset: torch.utils.data.Dataset, *, drop_last: bool, shuffle: bool): - """Constructs a :class:`~torch.utils.data.distributed.DistributedSampler` for a dataset. The - :class:`~torch.utils.data.distributed.DistributedSampler` assumes that each rank has a complete copy of the dataset. - It ensures that each rank sees a unique shard for each epoch containing ``len(datset) / get_world_size()`` samples. + """Constructs a :class:`~torch.utils.data.distributed.DistributedSampler` for a dataset. 
+ + The :class:`~torch.utils.data.distributed.DistributedSampler` assumes that each rank has a complete copy of the + dataset. It ensures that each rank sees a unique shard for each epoch containing + ``len(dataset) / get_world_size()`` samples. .. note:: - If the ``dataset`` is already shareded by rank, use a :class:`~torch.utils.data.SequentialSampler` + If the ``dataset`` is already sharded by rank, use a :class:`~torch.utils.data.SequentialSampler` or :class:`~torch.utils.data.RandomSampler`. Args: diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py index cf3dfce9f7..774321849f 100644 --- a/composer/utils/file_helpers.py +++ b/composer/utils/file_helpers.py @@ -69,9 +69,11 @@ def ensure_folder_is_empty(folder_name: Union[str, pathlib.Path]): def ensure_folder_has_no_conflicting_files(folder_name: Union[str, pathlib.Path], filename: str, timestamp: Timestamp): - """Ensure that the given folder does not have any files conflicting with the ``filename`` format string. If any - filename is formatted with a timestamp where the epoch, batch, sample, or token counts are after ``timestamp``, a - ``FileExistsError`` will be raised. ``filename`` and occurs later than ``timestamp``, raise a ``FileExistsError``. + """Ensure that the given folder does not have any files conflicting with the ``filename`` format string. + + If any filename is formatted with a timestamp where the epoch, batch, sample, or token counts are after + ``timestamp``, a ``FileExistsError`` will be raised. + If ``filename`` and occurs later than ``timestamp``, raise a ``FileExistsError``. Args: folder_name (str | pathlib.Path): The folder to inspect. @@ -154,7 +156,7 @@ def ensure_folder_has_no_conflicting_files(folder_name: Union[str, pathlib.Path] """ -def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: object): +def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: object): # noqa: D103 formatted_str = format_str.format( run_name=run_name, rank=dist.get_global_rank(), @@ -246,7 +248,12 @@ def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: """ -def format_name_with_dist_and_time(format_str: str, run_name: str, timestamp: Timestamp, **extra_format_kwargs: object): +def format_name_with_dist_and_time( + format_str: str, + run_name: str, + timestamp: Timestamp, + **extra_format_kwargs: object, +): # noqa: D103 formatted_str = format_str.format( run_name=run_name, rank=dist.get_global_rank(), diff --git a/composer/utils/import_helpers.py b/composer/utils/import_helpers.py index 01f5efdedd..2448fd112b 100644 --- a/composer/utils/import_helpers.py +++ b/composer/utils/import_helpers.py @@ -10,16 +10,16 @@ class MissingConditionalImportError(ImportError): + """Handles errors for external packages that might not be installed. + + Args: + extra_deps_group (str): the pip package group, found in setup.py. For example, nlp for `mosaicml[nlp]`. + conda_package (str, optional): The package(s) to install if using conda. + conda_channel (str, optional): The conda channel to install packages from. Set to ``None`` if the + package is not published on conda and must be installed via pip. + """ def __init__(self, extra_deps_group: str, conda_package: str, conda_channel: Optional[str] = 'conda-forge'): - """Handles errors for external packages that might not be installed. - - Args: - extra_deps_group (str): the pip package group, found in setup.py. For example, nlp for `mosaicml[nlp]`. 
- conda_package (str, optional): The package(s) to install if using conda. - conda_channel (str, optional): The conda channel to install packages from. Set to ``None`` if the - package is not published on conda and must be installed via pip. - """ if conda_channel: conda_command = f"conda install -c {conda_channel} {conda_package}" else: diff --git a/composer/utils/iter_helpers.py b/composer/utils/iter_helpers.py index c4f1aab79e..1414733c26 100644 --- a/composer/utils/iter_helpers.py +++ b/composer/utils/iter_helpers.py @@ -11,7 +11,7 @@ def map_collection(collection, map_fn): - """Apply ``map_fn`` on each element in ``collection``. + """Applies ``map_fn`` on each element in ``collection``. * If ``collection`` is a tuple or list of elements, ``map_fn`` is applied on each element, and a tuple or list, respectively, containing mapped values is returned. diff --git a/composer/utils/libcloud_object_store.py b/composer/utils/libcloud_object_store.py index c29ccbfd1a..b62c0276e5 100644 --- a/composer/utils/libcloud_object_store.py +++ b/composer/utils/libcloud_object_store.py @@ -142,7 +142,9 @@ def upload_object_via_stream(self, headers=headers) def _get_object(self, object_name: str): - """Get object from object store. Recursively follow any symlinks. If an object does not exist, automatically + """Get object from object store. + + Recursively follow any symlinks. If an object does not exist, automatically checks if it is a symlink by appending ``.symlink``. Args: diff --git a/composer/utils/libcloud_object_store_hparams.py b/composer/utils/libcloud_object_store_hparams.py index 7c012860e2..f55c15025a 100644 --- a/composer/utils/libcloud_object_store_hparams.py +++ b/composer/utils/libcloud_object_store_hparams.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Hyperparameters for the :class:`~.LibcloudObjectStore`.""" + import dataclasses import os from typing import Any, Dict, Optional @@ -12,7 +14,7 @@ @dataclasses.dataclass class LibcloudObjectStoreHparams(hp.Hparams): - """:class:`~composer.utils.libcloud_object_store.LibcloudObjectStore` hyperparameters. + """:class:`~.LibcloudObjectStore` hyperparameters. .. rubric:: Example @@ -145,7 +147,6 @@ def initialize_object(self): Returns: LibcloudObjectStore: The object_store. """ - return LibcloudObjectStore( provider=self.provider, container=self.container, diff --git a/composer/utils/module_surgery.py b/composer/utils/module_surgery.py index 99c106f1e1..fb9469f59b 100644 --- a/composer/utils/module_surgery.py +++ b/composer/utils/module_surgery.py @@ -18,8 +18,7 @@ module (torch.nn.Module): Source module module_index (int): The i-th instance of module class. - Returns: - Optional[torch.nn.Module]: The replacement module, or ``None`` to indicate no modification. + Returns: Optional[torch.nn.Module]: The replacement module, or ``None`` to indicate no modification. """ import collections import itertools @@ -238,8 +237,9 @@ def count_module_instances(module: torch.nn.Module, module_class: Union[Type[tor def _tensor_in(tensor: torch.Tensor, iterable: Iterable[torch.Tensor]): - """Returns whether `tensor is element` for any element in `iterable` This function is necessary because `tensor in - iterable` does not work reliably for `Tensor`s. + """Returns whether ``tensor is element`` for any element in ``iterable``. + + This function is necessary because ``tensor in iterable`` does not work reliably for :class:`.Tensor` objects. 
See https://discuss.pytorch.org/t/how-to-judge-a-tensor-is-in-a-list/15998/4 for further discussion. @@ -248,7 +248,8 @@ def _tensor_in(tensor: torch.Tensor, iterable: Iterable[torch.Tensor]): def _find_param_in_optimizer(param: torch.nn.parameter.Parameter, optimizer: Optimizer) -> int: - """Returns the index of the optimizer ``param_group`` containing ``param`` + """Returns the index of the optimizer ``param_group`` containing ``param``. + Optimizers store their parameters within an iterable of ``dict``s called :attr:`~torch.optim.Optimizer.param_groups`. By default, there is only one group in :attr:`~torch.optim.Optimizer.param_groups` @@ -278,19 +279,19 @@ def _find_param_in_optimizer(param: torch.nn.parameter.Parameter, optimizer: Opt def update_params_in_optimizer(old_params: Iterable[torch.nn.parameter.Parameter], new_params: Iterable[torch.nn.parameter.Parameter], optimizers: Union[Optimizer, Sequence[Optimizer]]) -> None: - """Remove ``old_params`` from the ``optimizers`` and insert ``new_params``. + r"""Remove ``old_params`` from the ``optimizers`` and insert ``new_params``. Newly added parameters will be added to the same :attr:`~torch.optim.Optimizer.param_group` as the removed parameters. A :class:`RuntimeError` will be raised if ``old_params`` is split across multiple parameter groups. This function differs from :meth:`replace_params_in_optimizer` in that ``len(old_params)`` need not equal - ``len(new_params)``. However, this function does not support replacing parameters accross multiple optimizer + ``len(new_params)``. However, this function does not support replacing parameters across multiple optimizer groups. .. warning:: Dynamically removing parameters from a :class:`~torch.optim.Optimizer` and adding parameters - to an existing :attr:`~torch.optim.Optimizer.param_group`\\s are not officially supported, so this + to an existing :attr:`~torch.optim.Optimizer.param_group`\s are not officially supported, so this function may fail when PyTorch is updated. The `recommended practice `_ is to instead recreate the optimizer when the parameter set changes diff --git a/composer/utils/reproducibility.py b/composer/utils/reproducibility.py index 3e59377417..b52cb6675b 100644 --- a/composer/utils/reproducibility.py +++ b/composer/utils/reproducibility.py @@ -39,7 +39,6 @@ .. testcleanup:: - trainer.engine.close() warnings.resetwarnings() Attributes: @@ -95,7 +94,6 @@ def configure_deterministic_mode(): .. testcleanup:: - trainer.engine.close() warnings.resetwarnings() However, to configure deterministic mode for operations before the trainer is initialized, manually invoke this @@ -147,10 +145,6 @@ def seed_all(seed: int): >>> trainer = Trainer(seed=42) - .. testcleanup:: - - trainer.engine.close() - However, to configure the random seed for operations before the trainer is initialized, manually invoke this function at the beginning of your training script. @@ -173,7 +167,6 @@ def get_rng_state() -> List[Dict[str, Any]]: Returns: List[Dict[str, Any]]: A list of RNG State Dicts, indexed by global rank. 
""" - rng_state = { "python": random.getstate(), "numpy": np.random.get_state(), diff --git a/docker/pytorch/generate_build_matrix.py b/docker/pytorch/generate_build_matrix.py index 80d96b98ee..67fc93044c 100644 --- a/docker/pytorch/generate_build_matrix.py +++ b/docker/pytorch/generate_build_matrix.py @@ -1,14 +1,13 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -""" -Helper script to generate the build_matrix.yaml +"""Helper script to generate the ``build_matrix.yaml`` and update ``README.md``. -Note: this script requires tabulate. Run `pip install tabulate` if not installed +Note: this script requires tabulate. Run ``pip install tabulate`` if not installed -To run: python generate_build_matrix.py +To run:: -Also update the `README.md` in the docker folder with the resulting table. + python generate_build_matrix.py """ import itertools @@ -20,7 +19,7 @@ import yaml -def get_pytorch_version(python_version: str): +def _get_pytorch_version(python_version: str): if python_version == "3.9": return "1.11.0" if python_version in "3.8": @@ -30,7 +29,7 @@ def get_pytorch_version(python_version: str): raise ValueError(f"Invalid python version: {python_version}") -def get_torchvision_version(pytorch_version: str): +def _get_torchvision_version(pytorch_version: str): if pytorch_version == "1.10.2": return "0.11.3" if pytorch_version == "1.11.0": @@ -40,7 +39,7 @@ def get_torchvision_version(pytorch_version: str): raise ValueError(f"Invalid pytorch_version: {pytorch_version}") -def get_base_image(cuda_version: str): +def _get_base_image(cuda_version: str): if cuda_version == "cpu": return "ubuntu:20.04" if cuda_version == "11.1.1": @@ -50,7 +49,7 @@ def get_base_image(cuda_version: str): raise ValueError(f"Invalid cuda_version: {cuda_version}") -def get_cuda_version(pytorch_version: str, use_cuda: bool): +def _get_cuda_version(pytorch_version: str, use_cuda: bool): if not use_cuda: return "cpu" if pytorch_version == "1.9.1": @@ -60,7 +59,7 @@ def get_cuda_version(pytorch_version: str, use_cuda: bool): raise ValueError(f"Invalid pytorch_version: {str}") -def get_cuda_version_tag(cuda_version: str): +def _get_cuda_version_tag(cuda_version: str): if cuda_version == "cpu": return "cpu" if cuda_version == "11.1.1": @@ -70,7 +69,7 @@ def get_cuda_version_tag(cuda_version: str): raise ValueError(f"Invalid cuda_version: {cuda_version}") -def get_tags(python_version: str, pytorch_version: str, cuda_version_tag: str, cuda_version: str, stage: str): +def _get_tags(python_version: str, pytorch_version: str, cuda_version_tag: str, cuda_version: str, stage: str): if stage == "pytorch_stage": base_image_name = "mosaicml/pytorch" elif stage == "vision_stage": @@ -88,7 +87,7 @@ def get_tags(python_version: str, pytorch_version: str, cuda_version_tag: str, c return tags -def main(): +def _main(): python_versions = ["3.7", "3.8", "3.9"] cuda_options = [True, False] stages = ["pytorch_stage", "vision_stage"] @@ -98,13 +97,13 @@ def main(): for product in itertools.product(python_versions, cuda_options, stages): python_version, use_cuda, stage = product - pytorch_version = get_pytorch_version(python_version) - cuda_version = get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) - cuda_version_tag = get_cuda_version_tag(cuda_version) + pytorch_version = _get_pytorch_version(python_version) + cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) + cuda_version_tag = _get_cuda_version_tag(cuda_version) entry = { "BASE_IMAGE": - 
get_base_image(cuda_version), + _get_base_image(cuda_version), "CUDA_VERSION": cuda_version, "CUDA_VERSION_TAG": @@ -118,9 +117,9 @@ def main(): "TARGET": stage, "TORCHVISION_VERSION": - get_torchvision_version(pytorch_version), + _get_torchvision_version(pytorch_version), "TAGS": - get_tags( + _get_tags( python_version=python_version, pytorch_version=pytorch_version, cuda_version_tag=cuda_version_tag, @@ -184,4 +183,4 @@ def main(): if __name__ == "__main__": - main() + _main() diff --git a/docker/pytorch/pillow_stub/setup.py b/docker/pytorch/pillow_stub/setup.py index 835d501d9d..502b19b117 100644 --- a/docker/pytorch/pillow_stub/setup.py +++ b/docker/pytorch/pillow_stub/setup.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Setup script for the stub pillow.""" + import os from setuptools import setup diff --git a/docs/source/conf.py b/docs/source/conf.py index fc8cfeea97..b3441c5509 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,19 +1,19 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html +"""Configuration file for the Sphinx documentation builder. + +This file only contains a selection of the most common options. For a full +list see the documentation: +https://www.sphinx-doc.org/en/master/usage/configuration.html -# -- Path setup -------------------------------------------------------------- +-- Path setup -------------------------------------------------------------- +If extensions (or modules to document with autodoc) are in another directory, +add these directories to sys.path here. If the directory is relative to the +documentation root, use os.path.abspath to make it absolute, like shown here. +""" import ast -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# import importlib import inspect import json @@ -92,7 +92,7 @@ def _get_commit_sha() -> str: - """Determine the commit sha. + """Determines the commit sha. Returns: str: The git commit sha, as a string. @@ -253,7 +253,7 @@ def skip_redundant_namedtuple_attributes( skip: bool, options: sphinx.ext.autodoc.Options, ): - # Hide the default, duplicate attributes for named tuples + """Hide the default, duplicate attributes for named tuples.""" del app, what, name, skip, options if '_tuplegetter' in obj.__class__.__name__: return True @@ -279,7 +279,6 @@ def determine_sphinx_path(item: Union[Type[object], Type[BaseException], types.M or could be a very unlikely edge condition where a public item in a private module is reimported only by sibling module(s), not any (grand)parents. """ - # Check to see if `item` is itself private if item.__name__.startswith("_"): public_name = item.__name__ @@ -320,7 +319,7 @@ def add_module_summary_tables( options: sphinx.ext.autodoc.Options, lines: List[str], ): - """This hook adds in summary tables for each module, documenting all functions, exceptions, classes, and attributes. + """Add summary tables for each module, documenting all functions, exceptions, classes, and attributes. 
It links reimported imports to their original source, as not to create a duplicate, indexed toctree entry. It automatically inserts itself at the end of each module docstring. @@ -430,9 +429,7 @@ def add_module_summary_tables( def rstjinja(app, docname, source): - """ - Render our pages as a jinja template for fancy templating goodness. - """ + """Render our pages as a jinja template for fancy templating goodness.""" # Make sure we're outputting HTML if app.builder.format != 'html': return @@ -442,6 +439,7 @@ def rstjinja(app, docname, source): def get_algorithms_metadata() -> Dict[str, Dict[str, str]]: + """Get the metadata for algorithms from the ``metadata.json`` files.""" EXCLUDE = ['no_op_model'] root = os.path.join(os.path.dirname(__file__), '..', '..', 'composer', 'algorithms') @@ -479,7 +477,7 @@ def get_algorithms_metadata() -> Dict[str, Dict[str, str]]: line_to_delete = _('Bases: %s') % u':py:class:`object`' -def add_line_no_object_base(self, text, *args, **kwargs): +def _add_line_no_object_base(self, text, *args, **kwargs): if text.strip() == line_to_delete: return @@ -489,8 +487,9 @@ def add_line_no_object_base(self, text, *args, **kwargs): add_directive_header = ClassDocumenter.add_directive_header -def add_directive_header_no_object_base(self, *args, **kwargs): - self.add_line = add_line_no_object_base.__get__(self) +def _add_directive_header_no_object_base(self, *args, **kwargs): + """Hide that all classes inherit from the base class ``object``.""" + self.add_line = _add_line_no_object_base.__get__(self) result = add_directive_header(self, *args, **kwargs) @@ -499,7 +498,7 @@ def add_directive_header_no_object_base(self, *args, **kwargs): return result -ClassDocumenter.add_directive_header = add_directive_header_no_object_base +ClassDocumenter.add_directive_header = _add_directive_header_no_object_base def _recursive_getattr(obj: Any, path: str): @@ -531,6 +530,7 @@ def _determine_lineno_of_attribute(module: types.ModuleType, attribute: str): def linkcode_resolve(domain: str, info: Dict[str, str]): + """Adds links to the GitHub source code in the API Reference.""" assert domain == "py", f"unsupported domain: {domain}" module_name = info['module'] @@ -605,6 +605,7 @@ def visit_reference(self, node: Element) -> None: def setup(app: sphinx.application.Sphinx): + """Setup hook.""" app.connect('autodoc-skip-member', skip_redundant_namedtuple_attributes) app.connect('autodoc-process-docstring', add_module_summary_tables) app.connect('source-read', rstjinja) diff --git a/docs/source/doctest_cleanup.py b/docs/source/doctest_cleanup.py index adc827a052..297ea89152 100644 --- a/docs/source/doctest_cleanup.py +++ b/docs/source/doctest_cleanup.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Cleanup script that is executed at the end of each doctest.""" + import os # variables are defined in doctest_fixtures.py diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index a412f9f7ff..49c54b8f6d 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -4,8 +4,7 @@ # disabling general type issues because of monkeypatching #yright: reportGeneralTypeIssues=none -""" -Fixtures available in doctests. +"""Fixtures available in doctests. The script is run before any doctests are executed, so all imports and variables are available in any doctest. 
@@ -134,6 +133,7 @@ def loss_fun(output, target, reduction="none"): + """Dummy loss function.""" return torch.ones_like(target) diff --git a/docs/source/getting_started/welcome_tour.rst b/docs/source/getting_started/welcome_tour.rst index a28c85fa9a..00f72c2e49 100644 --- a/docs/source/getting_started/welcome_tour.rst +++ b/docs/source/getting_started/welcome_tour.rst @@ -110,7 +110,7 @@ simple: class MixUp(Algorithm): def match(self, event: Event, state: State) -> bool: - """Determine whether the algorithm should run on a given event.""" + """Determines whether the algorithm should run on a given event.""" return event in [Event.AFTER_DATALOADER, Event.AFTER_LOSS] def apply(self, event: Event, state: State, logger: Logger) -> None: diff --git a/docs/source/tables/__init__.py b/docs/source/tables/__init__.py index cba5725129..b30b18379a 100644 --- a/docs/source/tables/__init__.py +++ b/docs/source/tables/__init__.py @@ -1,2 +1,4 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 + +"""Table helpers for composer docs.""" diff --git a/docs/source/tables/utils.py b/docs/source/tables/utils.py index a8eee76d03..8b3bfa37ad 100644 --- a/docs/source/tables/utils.py +++ b/docs/source/tables/utils.py @@ -10,6 +10,7 @@ def list_dirs(folder): """Lists all dirs for a given folder. + Args: folder (str): The folder to list dirs for. """ @@ -19,12 +20,12 @@ def list_dirs(folder): def assert_attributes_exist(name, module_dict, attributes): """Assert that module has the provided attributes. + Args: name (str): The class name. module_dict (dict): The dict form of the class. attributes (list): The attributes to check for. """ - for attribute in attributes: assert attribute in module_dict, \ f"{name} should define {attribute} in its __init__.py file." @@ -32,11 +33,14 @@ def assert_attributes_exist(name, module_dict, attributes): def get_metadata(names, attributes, module_basepath): """Returns a nested dict of metadata with names as keys. + Checks that all attributes exist in module given by module_basepath.name. + Args: names (str): The module names. attributes (list): The attributes to check for. module_basepath (str): The import path of the module. + Example:: >>> get_metadata( names=['blurpool', 'label_smoothing'], @@ -78,13 +82,16 @@ def get_metadata(names, attributes, module_basepath): def build_markdown_table(header, metadata, sorted_keys, row_format): """Builds a markdown table, formatting `row_format` with the `metadata`. + Entries in the table are ordered by `sorted_keys`. + Args: header (list): list of header strings metadata (dict): nested dict of metadata sorted_keys (list): order of rows in table row_format (list): list of length(header). Elements are either a string - or a single-argument callable that returns a string. + or a single-argument callable that returns a string. + Returns: table_md (list): table in markdown format """ @@ -109,6 +116,7 @@ def _print_row(row): def index_tag_in_lines(lines, tag): """Returns line number where tag is found. + Args: lines (list): List of lines to check. tag (str): Tag to find. @@ -121,10 +129,12 @@ def index_tag_in_lines(lines, tag): def update_table_in_file(table, source_file): """Updates the table content based on a source file. + Given a `source file`, updates the table. Searches the file for 'Table Start' and 'Table End' tags, and replaces the content between those tags. The original file is retained with the `.bkp` suffix. 
+ Args: table (list): list of strings source_file (path): path to source file diff --git a/docs/source/trainer/distributed_training.rst b/docs/source/trainer/distributed_training.rst index c39e7a53a1..107a719a24 100644 --- a/docs/source/trainer/distributed_training.rst +++ b/docs/source/trainer/distributed_training.rst @@ -48,7 +48,7 @@ For additional configurations of our launcher script, run ``composer --help``. .. argparse:: :module: composer.cli.launcher - :func: get_parser + :func: _get_parser :prog: composer :nodescription: diff --git a/docs/source/trainer/using_the_trainer.rst b/docs/source/trainer/using_the_trainer.rst index b66506a5bd..17f06f91f8 100644 --- a/docs/source/trainer/using_the_trainer.rst +++ b/docs/source/trainer/using_the_trainer.rst @@ -187,10 +187,6 @@ argument. # points of the training loop trainer.fit() -.. testcleanup:: - - trainer.engine.close() - We handle inserting algorithms into the training loop and in the right order. .. seealso:: diff --git a/examples/checkpoint_with_wandb.py b/examples/checkpoint_with_wandb.py index de3ca0dc40..9c864e75c9 100644 --- a/examples/checkpoint_with_wandb.py +++ b/examples/checkpoint_with_wandb.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Save and Load Checkpoints with `Weights and Biases `.""" + import shutil import torch.utils.data diff --git a/examples/custom_models.py b/examples/custom_models.py index 4eec03b4f6..3f64fc45a2 100644 --- a/examples/custom_models.py +++ b/examples/custom_models.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Example for training with an algorithm on a custom model.""" + import torch import torch.utils.data from torchvision import datasets, transforms @@ -13,6 +15,7 @@ # Your custom model class SimpleModel(composer.models.ComposerClassifier): + """Your custom model.""" def __init__(self, num_hidden: int, num_classes: int): module = torch.nn.Sequential( diff --git a/examples/profiler_demo.py b/examples/profiler_demo.py index 3c0909904a..229ddf090c 100644 --- a/examples/profiler_demo.py +++ b/examples/profiler_demo.py @@ -1,6 +1,11 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Profiling Example. + +For a walk-through of this example, please see the `profiling guide`_. 
+""" + # [imports-start] import torch from torch.utils.data import DataLoader diff --git a/examples/run_composer_trainer.py b/examples/run_composer_trainer.py index 7cf06b70cb..00a13b68f1 100644 --- a/examples/run_composer_trainer.py +++ b/examples/run_composer_trainer.py @@ -23,13 +23,13 @@ from composer.utils import dist -def warning_on_one_line(message: str, category: Type[Warning], filename: str, lineno: int, file=None, line=None): +def _warning_on_one_line(message: str, category: Type[Warning], filename: str, lineno: int, file=None, line=None): # From https://stackoverflow.com/questions/26430861/make-pythons-warnings-warn-not-mention-itself return f'{category.__name__}: {message} (source: {filename}:{lineno})\n' -def main() -> None: - warnings.formatwarning = warning_on_one_line +def _main() -> None: + warnings.formatwarning = _warning_on_one_line if len(sys.argv) == 1: sys.argv = [sys.argv[0], "--help"] @@ -67,4 +67,4 @@ def main() -> None: if __name__ == "__main__": - main() + _main() diff --git a/pyproject.toml b/pyproject.toml index cc72c0ab83..c4b039ba7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1013,3 +1013,8 @@ min-public-methods=2 # Exceptions that will emit a warning when being caught. Defaults to # "BaseException, Exception". overgeneral-exceptions="BaseException,Exception" + +[tool.pydocstyle] +convention="google" +add_ignore="D102,D105,D107,D401" +add_select="D400,D404" diff --git a/scripts/ffcv/create_ffcv_datasets.py b/scripts/ffcv/create_ffcv_datasets.py index be916774ff..4fb73561b4 100644 --- a/scripts/ffcv/create_ffcv_datasets.py +++ b/scripts/ffcv/create_ffcv_datasets.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Helper utilities to create FFCV datasets.""" + import logging import os import sys @@ -15,7 +17,7 @@ log = logging.getLogger(__name__) -def get_parser(): +def _get_parser(): parser = ArgumentParser(description="Utility for converting datasets to ffcv format.") parser.add_argument("--dataset", @@ -81,8 +83,8 @@ def get_parser(): return parser -def parse_args(): - parser = get_parser() +def _parse_args(): + parser = _get_parser() args = parser.parse_args() @@ -100,9 +102,9 @@ def parse_args(): return args -def main(): +def _main(): - args = parse_args() + args = _parse_args() ds = None remote_location = None @@ -131,4 +133,4 @@ def main(): if __name__ == '__main__': - sys.exit(main()) + sys.exit(_main()) diff --git a/scripts/mds/ade20k.py b/scripts/mds/ade20k.py index aee8ee3d6a..e963c14d81 100644 --- a/scripts/mds/ade20k.py +++ b/scripts/mds/ade20k.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""ADE20K streaming dataset conversion scripts.""" + import os import random from argparse import ArgumentParser, Namespace @@ -31,7 +33,6 @@ def get(in_root: str, split: str, shuffle: bool) -> List[Tuple[str, str, str]]: Returns: List of samples of (uid, image_filename, annotation_filename). 
""" - # Get uids image_glob_pattern = f'{in_root}/images/{split}/ADE_{split}_*.jpg' images = sorted(glob(image_glob_pattern)) diff --git a/scripts/mds/coco.py b/scripts/mds/coco.py index 78402125db..ac7f5652d6 100644 --- a/scripts/mds/coco.py +++ b/scripts/mds/coco.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Coco streaming dataset conversion scripts.""" + import os from argparse import ArgumentParser, Namespace from typing import Dict, Iterable diff --git a/scripts/mds/imagenet1k.py b/scripts/mds/imagenet1k.py index 08abea327a..51a0421d35 100644 --- a/scripts/mds/imagenet1k.py +++ b/scripts/mds/imagenet1k.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""ImageNet 1K Streaming Dataset Conversion Script.""" + import os import random from argparse import ArgumentParser, Namespace @@ -70,12 +72,11 @@ def each(pairs: List[Tuple[str, int]]) -> Iterable[Dict[str, Any]]: def main(args: Namespace) -> None: - """Main: create ImageNet1k streaming dataset. + """Create ImageNet1k streaming dataset. Args: args (Namespace): Commandline arguments. """ - fields = ['uid', 'x', 'y'] for (split, expected_num_samples, shuffle) in [ diff --git a/setup.py b/setup.py index 647eb70731..b00932bdef 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Composer package setup.""" + import os import site import sys @@ -17,6 +19,7 @@ # From https://stackoverflow.com/questions/51292333/how-to-tell-from-setup-py-if-the-module-is-being-installed-in-editable-mode class develop(develop_orig): + """Override the ``develop`` class to error if attempting an editable install as root.""" def run(self): if _IS_ROOT and (not _IS_VIRTUALENV) and (not _IS_USER): @@ -34,6 +37,7 @@ def run(self): def package_files(prefix: str, directory: str, extension: str): + """Get all the files to package.""" # from https://stackoverflow.com/a/36693250 paths = [] for (path, _, filenames) in os.walk(os.path.join(prefix, directory)): diff --git a/tests/algorithms/test_colout.py b/tests/algorithms/test_colout.py index b1bb7c9f3b..3ac9e73136 100644 --- a/tests/algorithms/test_colout.py +++ b/tests/algorithms/test_colout.py @@ -257,7 +257,7 @@ def test_match_incorrect(self, event: Event, colout_algorithm: ColOut, minimal_s @pytest.mark.parametrize("batch", [True]) def test_apply_batch(self, fake_image_batch: torch.Tensor, colout_algorithm: ColOut, minimal_state: State, empty_logger: Logger): - """Apply the algorithm to a fake batch.""" + """Applies the algorithm to a fake batch.""" p_row = colout_algorithm.p_row p_col = colout_algorithm.p_col @@ -270,7 +270,7 @@ def test_apply_batch(self, fake_image_batch: torch.Tensor, colout_algorithm: Col @pytest.mark.parametrize("batch", [True]) def test_apply_batch_pair(self, fake_image_batch: torch.Tensor, colout_algorithm: ColOut, minimal_state: State, empty_logger: Logger): - """Apply batch ColOut to 2-tuple of images.""" + """Applies batch ColOut to 2-tuple of images.""" p_row = colout_algorithm.p_row p_col = colout_algorithm.p_col diff --git a/tests/algorithms/test_progressive_resizing.py b/tests/algorithms/test_progressive_resizing.py index 4eace53d3d..36a5c0dc82 100644 --- a/tests/algorithms/test_progressive_resizing.py +++ b/tests/algorithms/test_progressive_resizing.py @@ -105,26 +105,22 @@ def test_without_target(self, X, y): @pytest.mark.parametrize("Wx,Hx", [(31, 31), (32, 32), (32, 16)]) def 
test_resize_batch_shape(self, X: torch.Tensor, y: torch.Tensor, mode: str, scale_factor: float): """Test scaling works for different input shapes.""" - Xc, _ = resize_batch(X, y, scale_factor, mode, resize_targets=False) assert check_scaled_shape(X, Xc, scale_factor) def test_resize_outputs_shape(self, X: torch.Tensor, y: torch.Tensor, mode: str, scale_factor: float): """Test that resizing outputs works.""" - _, yc = resize_batch(X, y, scale_factor, mode, resize_targets=True) assert check_scaled_shape(y, yc, scale_factor) def test_resize_outputs_crop(self, X: torch.Tensor, scale_factor: float): """Test that resizing outputs in crop mode gives the right targets.""" - xc, yc = resize_batch(X, X, scale_factor, "crop", resize_targets=True) assert torch.equal(xc, yc) @pytest.mark.parametrize("Wx,Hx,Wy,Hy", [(32, 32, 16, 16)]) def test_resize_outputs_different_shape(self, X, y, scale_factor: float, mode: str): """Test that resizing works when X and y have different shapes.""" - _, yc = resize_batch(X, y, scale_factor, mode, resize_targets=True) assert check_scaled_shape(y, yc, scale_factor) diff --git a/tests/loggers/test_file_logger.py b/tests/loggers/test_file_logger.py index 2fa566fe3d..0175de930a 100644 --- a/tests/loggers/test_file_logger.py +++ b/tests/loggers/test_file_logger.py @@ -27,7 +27,7 @@ def log_file_artifact(self, state: State, log_level: LogLevel, artifact_name: st @pytest.mark.parametrize("log_level", [LogLevel.EPOCH, LogLevel.BATCH]) -@pytest.mark.timeout(10) +@pytest.mark.timeout(30) def test_file_logger(dummy_state: State, log_level: LogLevel, tmp_path: pathlib.Path): log_file_name = os.path.join(tmp_path, "output.log") log_destination = FileLogger( diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 9d3d8ad235..2ee22aa239 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -531,6 +531,7 @@ def test_checkpoint( second_trainer_hparams = copy.deepcopy(composer_trainer_hparams) first_trainer = _test_checkpoint_trainer(composer_trainer_hparams) + dist.barrier() # Ensure all ranks wrote the checkpoint file save_interval_time = Time.from_timestring(save_interval) if save_interval_time.unit == TimeUnit.EPOCH: expected_num_checkpoints = ((num_epochs - 1) // save_interval_time.value) + 1 @@ -575,6 +576,7 @@ def test_checkpoint( second_trainer_hparams.load_strict_model_weights = False _test_checkpoint_trainer(second_trainer_hparams) + dist.barrier() # Ensure all ranks wrote the checkpoint file second_trainer_final_checkpoint_filepath = os.path.join(checkpoint_b_folder, final_checkpoint) assert_checkpoints_equivalent(