diff --git a/.ci/test_lint_doctests.py b/.ci/test_lint_doctests.py index 11f425fdf9..e6dbfdd58d 100644 --- a/.ci/test_lint_doctests.py +++ b/.ci/test_lint_doctests.py @@ -58,7 +58,7 @@ def test_run_doctests(): @pytest.mark.timeout(30) def test_docker_build_matrix(): - """Test that the docker build matrix is up to date""" + """Test that the docker build matrix is up to date.""" docker_folder = pathlib.Path(os.path.dirname(__file__)) / '..' / 'docker' # Capture the existing readme and build matrix contents @@ -86,7 +86,7 @@ def test_docker_build_matrix(): @pytest.mark.parametrize("example", [1, 2]) def test_release_tests_reflect_readme(example: int): - """Test that example_1.py and example_2.py in release_tests reflect the README.md""" + """Test that example_1.py and example_2.py in release_tests reflect the README.md.""" with open(pathlib.Path(os.path.dirname(__file__)) / '..' / 'README.md', 'r') as f: readme_lines = f.readlines() example_code_lines = [] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2658217054..aa3b9d36e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,16 +36,17 @@ repos: pass_filenames: false args: [--warnings] additional_dependencies: ["pyright@1.1.247"] - # - repo: https://github.com/PyCQA/pydocstyle - # hooks: - # - id: pydocstyle - # name: pydocstyle - # entry: pydocstyle - # language: python - # types: [python] - # additional_dependencies: - # - "toml" - # rev: 6.1.1 + - repo: https://github.com/PyCQA/pydocstyle + hooks: + - id: pydocstyle + name: pydocstyle + entry: pydocstyle + language: python + types: [python] + exclude: '(?:tests|.ci|composer\/algorithms|composer\/datasets|composer\/models)\/.*' + additional_dependencies: + - "toml" + rev: 6.1.1 - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 hooks: @@ -72,13 +73,11 @@ repos: - id: check-yaml - id: debug-statements - id: destroyed-symlinks - # - id: double-quote-string-fixer # TODO(ravi): Enable this check later. Generates a large diff. + # - id: double-quote-string-fixer # TODO(ravi): Enable this check later. Generates a large diff. - id: end-of-file-fixer - id: fix-byte-order-marker - id: mixed-line-ending - id: trailing-whitespace - # - id: name-tests-test # TODO(ravi): Enable this check later. Generates a large diff. - # args: ['--django'] - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.1.13 hooks: diff --git a/composer/__init__.py b/composer/__init__.py index 9d2d4303ae..c13042bb25 100644 --- a/composer/__init__.py +++ b/composer/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Composer.""" + from composer import algorithms as algorithms from composer import callbacks as callbacks from composer import datasets as datasets diff --git a/composer/algorithms/augmix/augmix.py b/composer/algorithms/augmix/augmix.py index c71c572d42..041a01884a 100644 --- a/composer/algorithms/augmix/augmix.py +++ b/composer/algorithms/augmix/augmix.py @@ -32,13 +32,13 @@ def augmix_image(img: ImgT, width: int = 3, alpha: float = 1.0, augmentation_set: List = augmentation_sets["all"]) -> ImgT: - """Applies AugMix (`Hendrycks et al, 2020 `_) data - augmentation to a single image or batch of images. See - :class:`.AugMix` and the - :doc:`Method Card ` for details. This function only acts on a - single image (or batch) per call and is unlikely to be used in a training loop. 
Use - :class:`~composer.algorithms.augmix.augmix.AugmentAndMixTransform` to use AugMix as - part of a :class:`torchvision.datasets.VisionDataset`\\'s ``transform``. + r"""Applies the AugMix (`Hendrycks et al, 2020 `_) data augmentation. + + This function works on a single image or batch of images. See :class:`.AugMix` and + the :doc:`Method Card ` for details. This function only acts on a + single image (or batch) per call and is unlikely to be used in a training loop. + Use :class:`~composer.algorithms.augmix.augmix.AugmentAndMixTransform` to use AugMix as + part of a :class:`torchvision.datasets.VisionDataset`\'s ``transform``. Example: .. testcode:: @@ -166,7 +166,9 @@ def forward(self, img: PillowImage) -> PillowImage: class AugMix(Algorithm): - """AugMix (`Hendrycks et al, 2020 `_) creates ``width`` sequences of ``depth`` + r"""The AugMix data augmentation technique. + + AugMix (`Hendrycks et al, 2020 `_) creates ``width`` sequences of ``depth`` image augmentations, applies each sequence with random intensity, and returns a convex combination of the ``width`` augmented images and the original image. The coefficients for mixing the augmented images are drawn from a uniform ``Dirichlet(alpha, alpha, ...)`` distribution. The coefficient for mixing the combined augmented image and the @@ -224,7 +226,7 @@ class AugMix(Algorithm): ``"color"``, ``"contrast"``, ``"sharpness"``, and ``"brightness"``. The original implementations have an intensity sampling scheme that samples a value bounded by 0.118 at a minimum, and a maximum value of - :math:`intensity \\times 0.18 + .1`, which ranges from 0.28 (intensity = 1) + :math:`intensity \times 0.18 + .1`, which ranges from 0.28 (intensity = 1) to 1.9 (intensity 10). These augmentations have different effects depending on whether they are < 0 or > 0 (or < 1 or > 1). "all" uses implementations of "color", "contrast", diff --git a/composer/algorithms/blurpool/blurpool_layers.py b/composer/algorithms/blurpool/blurpool_layers.py index cfdd8d2c11..a1817ca7a8 100644 --- a/composer/algorithms/blurpool/blurpool_layers.py +++ b/composer/algorithms/blurpool/blurpool_layers.py @@ -31,7 +31,7 @@ def _padding_for_filt_2d_same(filt: torch.Tensor): def blur_2d(input: torch.Tensor, stride: _size_2_t = 1, filter: Optional[torch.Tensor] = None) -> torch.Tensor: - """Apply a spatial low-pass filter. + """Applies a spatial low-pass filter. Args: input (:class:`torch.Tensor`): a 4d tensor of shape NCHW diff --git a/composer/algorithms/cutout/cutout.py b/composer/algorithms/cutout/cutout.py index 988c7cb72a..f5058eead1 100644 --- a/composer/algorithms/cutout/cutout.py +++ b/composer/algorithms/cutout/cutout.py @@ -129,7 +129,7 @@ def match(self, event: Event, state: State) -> bool: return event == Event.AFTER_DATALOADER def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]: - """Apply cutout on input images.""" + """Applies cutout on input images.""" x = state.batch_get_item(self.input_key) assert isinstance(x, Tensor), "Multiple tensors not supported for Cutout." diff --git a/composer/algorithms/progressive_resizing/__init__.py b/composer/algorithms/progressive_resizing/__init__.py index fb9969bb53..54f90aea97 100644 --- a/composer/algorithms/progressive_resizing/__init__.py +++ b/composer/algorithms/progressive_resizing/__init__.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Apply Fastai's `progressive resizing `__ data augmentation to speed up training. 
Progressive resizing initially reduces input resolution to speed up early training. Throughout training, the diff --git a/composer/algorithms/progressive_resizing/progressive_resizing.py b/composer/algorithms/progressive_resizing/progressive_resizing.py index 49e7214307..71a45d57a2 100644 --- a/composer/algorithms/progressive_resizing/progressive_resizing.py +++ b/composer/algorithms/progressive_resizing/progressive_resizing.py @@ -112,7 +112,9 @@ def resize_batch(input: torch.Tensor, class ProgressiveResizing(Algorithm): - """Apply Fastai's `progressive resizing `__ data augmentation to speed up training. @@ -204,6 +206,7 @@ def match(self, event: Event, state: State) -> bool: Args: event (:class:`Event`): The current event. state (:class:`State`): The current state. + Returns: bool: True if this algorithm should run now """ @@ -271,8 +274,11 @@ def _make_crop(tensor: torch.Tensor, scale_factor: float) -> T_ResizeTransform: def _make_crop_pair(X: torch.Tensor, y: torch.Tensor, scale_factor: float) -> Tuple[T_ResizeTransform, T_ResizeTransform]: - """Makes a pair of random crops for an input image X and target tensor y such that the same region is selected from - both.""" + """Makes a pair of random crops. + + Crops input image X and target tensor y such that the same region is selected from + both. + """ # New height and width for X HcX = int(scale_factor * X.shape[2]) WcX = int(scale_factor * X.shape[3]) diff --git a/composer/algorithms/squeeze_excite/squeeze_excite.py b/composer/algorithms/squeeze_excite/squeeze_excite.py index 5c88f1989b..60647c3963 100644 --- a/composer/algorithms/squeeze_excite/squeeze_excite.py +++ b/composer/algorithms/squeeze_excite/squeeze_excite.py @@ -162,7 +162,7 @@ def match(self, event: Event, state: State) -> bool: return event == Event.INIT def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]: - """Apply the Squeeze-and-Excitation layer replacement. + """Applies the Squeeze-and-Excitation layer replacement. Args: event (Event): the current event diff --git a/composer/algorithms/stochastic_depth/stochastic_depth.py b/composer/algorithms/stochastic_depth/stochastic_depth.py index 29747511fc..7e4375445b 100644 --- a/composer/algorithms/stochastic_depth/stochastic_depth.py +++ b/composer/algorithms/stochastic_depth/stochastic_depth.py @@ -191,7 +191,6 @@ def __init__(self, @property def find_unused_parameters(self) -> bool: """DDP parameter to notify that parameters may not have gradients if it is dropped during the forward pass.""" - return (self.stochastic_method == "block") def match(self, event: Event, state: State) -> bool: diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py index 06b54b11c5..b2120705b0 100644 --- a/composer/algorithms/swa/swa.py +++ b/composer/algorithms/swa/swa.py @@ -26,7 +26,7 @@ def _assert_valid_duration(time: Time): class SWA(Algorithm): - """Apply Stochastic Weight Averaging (`Izmailov et al, 2018 `_) + """Applies Stochastic Weight Averaging (`Izmailov et al, 2018 `_) Stochastic Weight Averaging (SWA) averages model weights sampled at different times near the end of training. 
This leads to better diff --git a/composer/algorithms/utils/augmentation_primitives.py b/composer/algorithms/utils/augmentation_primitives.py index 80b5e345d2..226a5549d4 100644 --- a/composer/algorithms/utils/augmentation_primitives.py +++ b/composer/algorithms/utils/augmentation_primitives.py @@ -76,6 +76,7 @@ def _float_parameter(level: float, maxval: float): level (float): Level of the operation that will be between [0, 10]. maxval (float): Maximum value that the operation can have. This will be scaled to level/10. + Returns: float: The result from scaling ``maxval`` according to ``level``. """ @@ -88,8 +89,10 @@ def _sample_level(n: float): def _symmetric_sample(level: float): - """Helper function to sample from a distribution over the domain [0.1, 10] with median == 1 and uniform probability - of x | 0.1 ≤ x ≤ 1, and x | 1 ≤ x ≤ 10. + """Helper function to sample from a symmetric distribution. + + The distribution over the domain [0.1, 10] with median == 1 and uniform probability of x | 0.1 ≤ x ≤ 1, + and x | 1 ≤ x ≤ 10. Used for sampling transforms that can range from intensity 0 to infinity, and for which an intensity of 1 == no change. @@ -106,7 +109,8 @@ def autocontrast(pil_img: Image.Image, level: float = 0.0): .. seealso:: :func:`PIL.ImageOps.autocontrast`. Args: - pil_img (Image.Image): The image + pil_img (Image.Image): The image. + level (float): The intensity. """ del level # unused return ImageOps.autocontrast(pil_img) @@ -118,7 +122,8 @@ def equalize(pil_img: Image.Image, level: float): .. seealso:: :func:`PIL.ImageOps.equalize`. Args: - pil_img (Image.Image): The image + pil_img (Image.Image): The image. + level (float): The intensity. """ del level # unused return ImageOps.equalize(pil_img) diff --git a/composer/callbacks/callback_hparams_registry.py b/composer/callbacks/callback_hparams_registry.py index 83eee2eebe..77c3fe26e7 100644 --- a/composer/callbacks/callback_hparams_registry.py +++ b/composer/callbacks/callback_hparams_registry.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Hyperparameter registry for callbacks.""" + from typing import Dict, Type, Union import yahp as hp diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 807135a485..1047d98d99 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -27,11 +27,11 @@ def checkpoint_periodically(interval: Union[str, int, Time]) -> Callable[[State, Event], bool]: - """Helper function to create a checkpoint scheduler according to a specified interval. + r"""Helper function to create a checkpoint scheduler according to a specified interval. Args: interval (Union[str, int, :class:`.Time`]): The interval describing how often checkpoints should be - saved. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`\\s. + saved. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`\s. Otherwise, the unit must be either :attr:`.TimeUnit.EPOCH` or :attr:`.TimeUnit.BATCH`. Checkpoints will be saved every ``n`` batches or epochs (depending on the unit), @@ -85,7 +85,7 @@ def save_interval(state: State, event: Event): return save_interval -class CheckpointSaver(Callback): +class CheckpointSaver(Callback): # noqa: D101 __doc__ = f"""Callback to save checkpoints. .. note:: @@ -114,10 +114,6 @@ class CheckpointSaver(Callback): ... ) ... ]) - .. 
testcleanup:: - - trainer.engine.close() - Args: folder (str, optional): Format string for the folder where checkpoints will be saved. Default: ``'{{run_name}}/checkpoints'``. diff --git a/composer/callbacks/early_stopper.py b/composer/callbacks/early_stopper.py index abce6c82d1..c55b7eeff8 100644 --- a/composer/callbacks/early_stopper.py +++ b/composer/callbacks/early_stopper.py @@ -21,11 +21,9 @@ class EarlyStopper(Callback): - """This callback tracks a training or evaluation metric and halts training if the metric does not - improve within a given interval. - - Example + """Halt training if a metric does not improve within a given interval. + Example: .. doctest:: >>> from composer.callbacks.early_stopper import EarlyStopper diff --git a/composer/callbacks/grad_monitor.py b/composer/callbacks/grad_monitor.py index 5fbb06f9bb..d52b1a7d49 100644 --- a/composer/callbacks/grad_monitor.py +++ b/composer/callbacks/grad_monitor.py @@ -17,8 +17,7 @@ class GradMonitor(Callback): the model and hence may cause a reduction in throughput while training large models. In order to ensure the correctness of norm, this function should be called after gradient unscaling in cases where gradients are scaled. - Example - + Example: .. doctest:: >>> from composer.callbacks import GradMonitor @@ -32,10 +31,6 @@ class GradMonitor(Callback): ... callbacks=[GradMonitor()], ... ) - .. testcleanup:: - - trainer.engine.close() - The L2 norms are logged by the :class:`~composer.loggers.logger.Logger` to the following keys as described below. +-----------------------------------+-------------------------------------------------------------+ @@ -57,7 +52,6 @@ class GradMonitor(Callback): """ def __init__(self, log_layer_grad_norms: bool = False): - super().__init__() self.log_layer_grad_norms = log_layer_grad_norms def after_train_batch(self, state: State, logger: Logger): diff --git a/composer/callbacks/lr_monitor.py b/composer/callbacks/lr_monitor.py index df25b06a38..f8f2d88732 100644 --- a/composer/callbacks/lr_monitor.py +++ b/composer/callbacks/lr_monitor.py @@ -14,8 +14,7 @@ class LRMonitor(Callback): This callback iterates over all optimizers and their parameter groups to log learning rate under the ``lr-{OPTIMIZER_NAME}/group{GROUP_NUMBER}`` key. - Example - + Example: .. doctest:: >>> from composer.callbacks import LRMonitor @@ -29,10 +28,6 @@ class LRMonitor(Callback): ... callbacks=[LRMonitor()], ... ) - .. testcleanup:: - - trainer.engine.close() - The learning rate is logged by the :class:`~composer.loggers.logger.Logger` to the following key as described below. @@ -46,7 +41,7 @@ class LRMonitor(Callback): """ def __init__(self) -> None: - super().__init__() + pass def batch_end(self, state: State, logger: Logger): assert state.optimizers is not None, "optimizers must be defined" diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py index 6774def8dc..8a89f564cf 100644 --- a/composer/callbacks/memory_monitor.py +++ b/composer/callbacks/memory_monitor.py @@ -23,8 +23,7 @@ class MemoryMonitor(Callback): This callback calls the torch memory stats API for cuda (see :func:`torch.cuda.memory_stats`) on the :attr:`~composer.core.event.Event.AFTER_TRAIN_BATCH` and reports different memory statistics. - Example - + Example: .. doctest:: >>> from composer.callbacks import MemoryMonitor @@ -38,10 +37,6 @@ class MemoryMonitor(Callback): ... callbacks=[MemoryMonitor()], ... ) - .. 
testcleanup:: - - trainer.engine.close() - The memory statistics are logged by the :class:`~composer.loggers.logger.Logger` to the following keys as described below. diff --git a/composer/callbacks/mlperf.py b/composer/callbacks/mlperf.py index 1746909876..655ed5734e 100644 --- a/composer/callbacks/mlperf.py +++ b/composer/callbacks/mlperf.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Create compliant results file for MLPerf Training benchmark.""" + import json import logging import os @@ -44,14 +46,14 @@ def _local_rank_zero() -> bool: return dist.get_local_rank() == 0 -def require_mlperf_logging(): +def _require_mlperf_logging(): if not mlperf_available: raise ImportError("""Please install with `pip install 'mosaicml[mlperf]'` and also install the logging library from: https://github.com/mlcommons/logging""") class MLPerfCallback(Callback): - """Creates a compliant results file for MLPerf Training benchmark. + """Create compliant results file for MLPerf Training benchmark. A submission folder structure will be created with the ``root_folder`` as the base and the following directories:: @@ -73,18 +75,17 @@ class MLPerfCallback(Callback): Currently, only open division submissions are supported with this Callback. Example: + .. code-block:: python - .. code-block:: python - - from composer.callbacks import MLPerfCallback + from composer.callbacks import MLPerfCallback - callback = MLPerfCallback( - root_folder='/submission', - index=0, - metric_name='Accuracy', - metric_label='eval', - target='0.759', - ) + callback = MLPerfCallback( + root_folder='/submission', + index=0, + metric_name='Accuracy', + metric_label='eval', + target='0.759', + ) During training, the metric found in ``state.current_metrics[metric_label][metric_name]`` will be compared against the target criterion. @@ -138,7 +139,7 @@ def __init__( host_processors_per_node: Optional[int] = None, ) -> None: - require_mlperf_logging() + _require_mlperf_logging() if benchmark not in BENCHMARKS: raise ValueError(f"benchmark: {benchmark} must be one of {BENCHMARKS}") @@ -176,7 +177,6 @@ def __init__( self.success = False def init(self, state: State, logger: Logger) -> None: - # setup here requies access to rank, which is only available after # the trainer is initialized if _local_rank_zero(): @@ -242,8 +242,7 @@ def _get_accuracy(self, state: State) -> float: return float(metric) def _get_dataloader_stats(self, dataloader: Iterable): - """ returns tuple of (batch_size, num_samples)""" - + """Returns a tuple of ``(batch_size, num_samples)``.""" if isinstance(dataloader, DataLoader): return (dataloader.batch_size, len(dataloader.dataset)) # type: ignore try: diff --git a/composer/callbacks/speed_monitor.py b/composer/callbacks/speed_monitor.py index fb6a7f7772..2ef68dca2f 100644 --- a/composer/callbacks/speed_monitor.py +++ b/composer/callbacks/speed_monitor.py @@ -23,8 +23,7 @@ class SpeedMonitor(Callback): average throughput and wall clock train, validation, and total time is also logged on the :attr:`~composer.core.event.Event.EPOCH_END` event. - Example - + Example: .. doctest:: >>> from composer.callbacks import SpeedMonitor @@ -38,10 +37,6 @@ class SpeedMonitor(Callback): ... callbacks=[SpeedMonitor(window_size=100)], ... ) - .. testcleanup:: - - trainer.engine.close() - The training throughput is logged by the :class:`~composer.loggers.logger.Logger` to the following keys as described below. 
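Most hunks in this diff apply the same two pydocstyle fixes seen above: the summary becomes a single line that ends with a period (D400), and a blank line separates it from the body (D205), with ``Example:`` used as a proper section header instead of a bare ``Example`` heading. A stand-alone sketch of the target style, using a made-up function rather than one from Composer:

.. code-block:: python

    def scale_learning_rate(lr: float, factor: float) -> float:
        """Scale a learning rate by ``factor``.

        The one-line summary above ends with a period (pydocstyle D400) and is
        separated from this body by a blank line (D205), matching the pattern
        applied throughout this diff.

        Args:
            lr (float): The current learning rate.
            factor (float): The multiplicative scaling factor.

        Returns:
            float: The scaled learning rate.
        """
        return lr * factor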
@@ -90,13 +85,6 @@ def __init__(self, window_size: int = 100): self.loaded_state: Optional[Dict[str, Any]] = None def state_dict(self) -> Dict[str, Any]: - """Returns a dictionary representing the internal state of the SpeedMonitor object. - - The returned dictionary is pickle-able via :func:`torch.save`. - - Returns: - Dict[str, Any]: The state of the SpeedMonitor object - """ current_time = time.time() return { "train_examples_per_epoch": self.train_examples_per_epoch, @@ -109,12 +97,6 @@ def state_dict(self) -> Dict[str, Any]: } def load_state_dict(self, state: Dict[str, Any]) -> None: - """Restores the state of SpeedMonitor object. - - Args: - state (Dict[str, Any]): The state of the object, - as previously returned by :meth:`.state_dict` - """ self.loaded_state = state def _load_state(self) -> None: diff --git a/composer/callbacks/threshold_stopper.py b/composer/callbacks/threshold_stopper.py index 49803487d1..9c267094fe 100644 --- a/composer/callbacks/threshold_stopper.py +++ b/composer/callbacks/threshold_stopper.py @@ -13,11 +13,9 @@ class ThresholdStopper(Callback): - """This callback tracks a training or evaluation metric and halts training when the - metric value reaches a certain threshold. - - Example + """Halt training when a metric value reaches a certain threshold. + Example: .. doctest:: >>> from composer.callbacks.threshold_stopper import ThresholdStopper diff --git a/composer/cli/__init__.py b/composer/cli/__init__.py index cba5725129..0eb6811a8e 100644 --- a/composer/cli/__init__.py +++ b/composer/cli/__init__.py @@ -1,2 +1,13 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 + +"""Composer CLI.""" + +import sys + +from composer.cli.launcher import main + +__all__ = ["main"] + +if __name__ == "__main__": + sys.exit(main()) diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index e82a21ca7a..71944bb984 100644 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -1,5 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 + +"""The Composer CLI launcher for distributed training.""" + import datetime import logging import os @@ -21,7 +24,7 @@ log = logging.getLogger(__name__) -def get_parser(): +def _get_parser(): parser = ArgumentParser(description="Utility for launching distributed machine learning jobs.") required_args = parser.add_argument_group("required arguments") @@ -120,7 +123,7 @@ def _get_free_tcp_port() -> int: def _parse_args(): - parser = get_parser() + parser = _get_parser() args = parser.parse_args() @@ -399,6 +402,7 @@ def _aggregate_process_returncode(processes: Dict[int, subprocess.Popen]) -> int def main(): + """Entrypoint into the Composer CLI.""" args = _parse_args() logging.basicConfig() diff --git a/composer/core/algorithm.py b/composer/core/algorithm.py index ce086e6225..763c2c1da9 100644 --- a/composer/core/algorithm.py +++ b/composer/core/algorithm.py @@ -42,8 +42,7 @@ def __init__(self, *args, **kwargs): # Stub signature for PyRight @property def find_unused_parameters(self) -> bool: - """Return True to indicate that the effect of this algorithm may cause some model parameters to be unused. - Defaults to False. + """Indicates whether this algorithm may cause some model parameters to be unused. Defaults to False. For example, it is used to tell :class:`torch.nn.parallel.DistributedDataParallel` (DDP) that some parameters will be frozen during training and hence it should not expect gradients from them. 
All algorithms which do any @@ -66,11 +65,9 @@ def backwards_create_graph(self) -> bool: @abstractmethod def match(self, event: Event, state: State) -> bool: - """Determines whether this algorithm should run given the current :class:`~.event.Event` and - :class:`~.state.State`. + """Determines whether this algorithm should run given the current :class:`~.event.Event` and :class:`~.state.State`. Examples: - To only run on a specific event (e.g., on :attr:`~.Event.BEFORE_LOSS`), override match as shown below: >>> class MyAlgorithm: @@ -92,6 +89,7 @@ def match(self, event: Event, state: State) -> bool: Args: event (Event): The current event. state (State): The current state. + Returns: bool: True if this algorithm should run now. """ @@ -108,6 +106,7 @@ def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]: event (Event): The current event. state (State): The current state. logger (Logger): A logger to use for logging algorithm-specific metrics. + Returns: int or None: exit code that will be stored in :class:`~.engine.Trace` and made accessible for debugging. """ diff --git a/composer/core/callback.py b/composer/core/callback.py index 30052c3a74..66e0244016 100644 --- a/composer/core/callback.py +++ b/composer/core/callback.py @@ -50,10 +50,6 @@ class Callback(Serializable, abc.ABC): >>> _ = trainer.engine.run_event(Event.EPOCH_START) Epoch: 0 - .. testcleanup:: - - trainer.engine.close() - #. Override :meth:`run_event` if you want a single method to handle all events. If this method is overridden, then the individual methods corresponding to each event name (such as :meth:`epoch_start`) will no longer be automatically invoked. For example, if you override :meth:`run_event` then :meth:`epoch_start` will not be called @@ -82,10 +78,6 @@ class Callback(Serializable, abc.ABC): >>> # is triggered, like this: >>> _ = trainer.engine.run_event(Event.EPOCH_START) Epoch: 0 - - .. testcleanup:: - - trainer.engine.close() """ def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -94,7 +86,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: pass def run_event(self, event: Event, state: State, logger: Logger) -> None: - """This method is called by the engine on each event. + """Called by the engine on each event. Args: event (Event): The event. @@ -441,10 +433,9 @@ def close(self, state: State, logger: Logger) -> None: pass def post_close(self) -> None: - """This hook is called after :meth:`close` has been invoked for each callback. Very few callbacks should need to - implement :meth:`post_close`. + """Called after :meth:`close` has been invoked for each callback. - This callback can be used to back up any data that may have been written by other callbacks during - :meth:`close`. + Very few callbacks should need to implement :meth:`post_close`. This callback can be used to back up any data + that may have been written by other callbacks during :meth:`close`. """ pass diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index 6a649bec0f..cae177002a 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -49,8 +49,9 @@ def _split_mapping(m, num_microbatches: int): def _default_split_batch(batch: Any, num_microbatches: int) -> Sequence: - """Splits batch into `num_microbatches` chunks for gradient accumulation. Works with tensors, dictionaries of - tensors, (x, y) tuples, and lists where batch is the 2nd dimension. + """Splits batch into `num_microbatches` chunks for gradient accumulation. 
+ + Works with tensors, dictionaries of tensors, (x, y) tuples, and lists where batch is the 2nd dimension. Args: batch: output from the dataloader. @@ -110,10 +111,6 @@ class DataSpec: ... max_duration="1ep", ... ) - .. testcleanup:: - - trainer.engine.close() - Args: dataloader (Iterable): The dataloader, which can be any iterable that yields batches. @@ -216,7 +213,7 @@ def _default_get_num_tokens_in_batch(self, batch: Batch) -> int: def ensure_data_spec(dataloader: Union[DataSpec, Iterable, dict]) -> DataSpec: - """Ensures that the ``dataloader`` is a :class:`.DataSpec` + """Ensures that the ``dataloader`` is a :class:`.DataSpec`. Args: dataloader (DataSpec | Iterable | dict): A DataSpec, DataLoader, or Dict of DataSpec kwargs. diff --git a/composer/core/engine.py b/composer/core/engine.py index dc4909e30c..d96392e63d 100644 --- a/composer/core/engine.py +++ b/composer/core/engine.py @@ -200,6 +200,7 @@ def run_event( Args: event (Event | str): The current :class:`~.event.Event`. It can be the enum member values or a string with the event value. + Returns: traces (Traces): Ordered dictionary of trace for each algorithm. """ diff --git a/composer/core/evaluator.py b/composer/core/evaluator.py index bcad074c63..1879189947 100644 --- a/composer/core/evaluator.py +++ b/composer/core/evaluator.py @@ -80,11 +80,6 @@ class Evaluator: ... max_duration="1ep", ... ) - .. testcleanup:: - - trainer.engine.close() - - Args: label (str): Name of the Evaluator dataloader (DataSpec | Iterable | Dict[str, Any]): Iterable that yields batches, a :class:`.DataSpec` for evaluation, diff --git a/composer/core/event.py b/composer/core/event.py index a38aad5b0d..616f7e0cc1 100644 --- a/composer/core/event.py +++ b/composer/core/event.py @@ -1,15 +1,17 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Events represent specific points in the training loop where an :class:`~.core.Algorithm` and :class:`~.core.Callback` -can run.""" +"""Training Loop Events.""" from composer.utils.string_enum import StringEnum __all__ = ["Event"] class Event(StringEnum): - """Enum to represent events in the training loop. + """Enum to represent training loop events. + + Events mark specific points in the training loop where an :class:`~.core.Algorithm` and :class:`~.core.Callback` + can run. The following pseudocode shows where each event fires in the training loop: @@ -167,14 +169,20 @@ class Event(StringEnum): @property def is_before_event(self) -> bool: - """Whether the event is a 'before_*' event (e.g., :attr:`~Event.BEFORE_LOSS`) and has a corresponding 'after_*' - (.e.g., :attr:`~Event.AFTER_LOSS`).""" + """Whether the event is an "before" event. + + An "before" event (e.g., :attr:`~Event.BEFORE_LOSS`) has a corresponding "after" event + (.e.g., :attr:`~Event.AFTER_LOSS`). + """ return self in _BEFORE_EVENTS @property def is_after_event(self) -> bool: - """Whether the event is an 'after_*' event (e.g., :attr:`~Event.AFTER_LOSS`) and has a corresponding 'before_*' - (.e.g., :attr:`~Event.BEFORE_LOSS`).""" + """Whether the event is an "after" event. + + An "after" event (e.g., :attr:`~Event.AFTER_LOSS`) has a corresponding "before" event + (.e.g., :attr:`~Event.BEFORE_LOSS`). 
+ """ return self in _AFTER_EVENTS @property diff --git a/composer/core/precision.py b/composer/core/precision.py index aac96052e3..2ead921d16 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -45,7 +45,6 @@ def get_precision_context(precision: Union[str, Precision]) -> Generator[None, N Args: precision (str | Precision): Precision for the context """ - precision = Precision(precision) if precision == Precision.FP32: if torch.cuda.is_available(): diff --git a/composer/core/state.py b/composer/core/state.py index eb7f7bc35c..907c84e95c 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -453,7 +453,6 @@ def load_state_dict(self, state: Dict[str, Any], strict: bool = False): strict (bool): whether the keys in the ``state["model"]`` should perfectly match the keys in the ``self.model``. Defaults to False. """ - state = _ensure_backwards_compatible_checkpointing(state) for attribute_name, serialized_value in state.items(): diff --git a/composer/core/time.py b/composer/core/time.py index f790e5cfa0..99c4d58ac1 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -55,9 +55,8 @@ class TimeUnit(StringEnum): TValue = TypeVar("TValue", int, float) -class Time(Generic[TValue]): - """Time represents static durations of training time or points in the training process in terms of a - :class:`TimeUnit` enum (epochs, batches, samples, tokens, or duration). +class Time(Generic[TValue], Serializable): + """Time represents static durations of training time in terms of a :class:`TimeUnit` enum. See the :doc:`Time Guide ` for more details on tracking time during training. @@ -129,7 +128,9 @@ def __init__( @classmethod def from_epoch(cls, epoch: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.EPOCH`. Equivalent to ``Time(epoch, TimeUnit.EPOCH)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.EPOCH`. + + Equivalent to ``Time(epoch, TimeUnit.EPOCH)``. Args: epoch (int): Number of epochs. @@ -141,7 +142,9 @@ def from_epoch(cls, epoch: int) -> Time: @classmethod def from_batch(cls, batch: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.BATCH`. Equivalent to ``Time(batch, TimeUnit.BATCH)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.BATCH`. + + Equivalent to ``Time(batch, TimeUnit.BATCH)``. Args: batch (int): Number of batches. @@ -153,8 +156,9 @@ def from_batch(cls, batch: int) -> Time: @classmethod def from_sample(cls, sample: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.SAMPLE`. Equivalent to ``Time(sample, - TimeUnit.SAMPLE)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.SAMPLE`. + + Equivalent to ``Time(sample, TimeUnit.SAMPLE)``. Args: sample (int): Number of samples. @@ -166,7 +170,9 @@ def from_sample(cls, sample: int) -> Time: @classmethod def from_token(cls, token: int) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.TOKEN`. Equivalent to ``Time(sample, TimeUnit.TOKEN)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.TOKEN`. + + Equivalent to ``Time(sample, TimeUnit.TOKEN)``. Args: token (int): Number of tokens. @@ -178,8 +184,9 @@ def from_token(cls, token: int) -> Time: @classmethod def from_duration(cls, duration: float) -> Time: - """Create a :class:`Time` with units of :attr:`TimeUnit.DURATION`. Equivalent to ``Time(duration, - TimeUnit.DURATION)``. + """Create a :class:`Time` with units of :attr:`TimeUnit.DURATION`. + + Equivalent to ``Time(duration, TimeUnit.DURATION)``. 
Args: duration (float): Duration of the training process. Should be on ``[0, 1)`` @@ -326,8 +333,9 @@ def __hash__(self): @classmethod def from_timestring(cls, timestring: str) -> Time: - """Parse a time string into a :class:`Time` instance. A time string is a numerical value followed by the value - of a :class:`TimeUnit` enum. For example: + """Parse a time string into a :class:`Time` instance. + + A time string is a numerical value followed by the value of a :class:`TimeUnit` enum. For example: >>> Time.from_timestring("5ep") # describes 5 epochs. Time(5, TimeUnit.EPOCH) @@ -356,8 +364,10 @@ def from_timestring(cls, timestring: str) -> Time: class Timestamp(Serializable): - """Timestamp represents a snapshot of the current training progress, in terms of epochs, batches, - samples, and tokens. Timestamps are not updated in-place. + """Timestamp represents a snapshot of the current training progress. + + The timestamp measures training progress in terms of epochs, batches, samples, tokens, and wall clock time. + Timestamps are not updated in-place. See the :doc:`Time Guide ` for more details on tracking time during training. @@ -452,6 +462,11 @@ def state_dict(self) -> Dict[str, Any]: } def get_state(self) -> Dict[str, Union[Time[int], datetime.timedelta]]: + """Returns all values of the timestamp object in a dictionary. + + Returns: + Dict[str, Union[Time[int], datetime.timedelta]]: All values of the timestamp object. + """ return { "epoch": self.epoch, "batch": self.batch, @@ -614,7 +629,7 @@ def to_next_batch( tokens: Union[int, Time] = 0, duration: Optional[datetime.timedelta] = None, ): - """Create a new :class:`.Timestamp`, with the batch, sample, and token counts properly incremented. + """Create a new :class:`.Timestamp`, advanced to the next batch. Equivalent to: @@ -670,12 +685,10 @@ def to_next_batch( ) def to_next_epoch(self): - """Create a new :class:`.Timestamp` incremented by one epoch and with - :attr:`batch_in_epoch`, :attr:`sample_in_epoch`, and :attr:`token_in_epoch` reset. + """Create a new :class:`.Timestamp`, advanced to the next epoch. Equivalent to: - .. testsetup:: from composer.core.time import Timestamp @@ -718,8 +731,9 @@ def copy( epoch_wct: Optional[datetime.timedelta] = None, batch_wct: Optional[datetime.timedelta] = None, ) -> Timestamp: - """Create a copy of the timestamp. Any specified values will override the existing values in the - returned copy. + """Create a copy of the timestamp. + + Any specified values will override the existing values in the returned copy. Args: epoch (int | Time[int], optional): The epoch. @@ -764,7 +778,7 @@ def __repr__(self) -> str: def ensure_time(maybe_time: Union[Time, str, int], int_unit: Union[TimeUnit, str]) -> Time: - """Ensure ``maybe_time`` is an instance of :class:`.Time` + """Ensure ``maybe_time`` is an instance of :class:`.Time`. Args: maybe_time (Time | str): A time string, integer, or instance of :class:`.Time`. diff --git a/composer/datasets/streaming/format.py b/composer/datasets/streaming/format.py index f7454f0129..4ff3f60707 100644 --- a/composer/datasets/streaming/format.py +++ b/composer/datasets/streaming/format.py @@ -1,8 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""The :class:`StreamingDatsetIndex` format that defines shard/sample metadata for :class:`StreamingDataset`. 
-""" +"""The :class:`StreamingDatsetIndex` format that defines shard/sample metadata for :class:`StreamingDataset`.""" import math from io import BufferedIOBase, BufferedReader, BufferedWriter, BytesIO @@ -279,7 +278,6 @@ def get_partition(self, world: World, batch_size: Optional[int] = None) -> Tuple min_id (int): The lowest sample ID of this partition. max_id (int): The highest sample ID of this partition. """ - global_device = world.global_device global_num_devices = world.global_num_devices node_worker = world.node_worker diff --git a/composer/datasets/streaming/world.py b/composer/datasets/streaming/world.py index 2912be5b16..4ffda61b6f 100644 --- a/composer/datasets/streaming/world.py +++ b/composer/datasets/streaming/world.py @@ -52,7 +52,6 @@ class World(NamedTuple): def get_world() -> World: """Returns a :class:`World` object, initialized using :mod:`composer.utils.dist` and :func:`torch.utils.data.get_worker_info`""" - # Node and Device info global_node = dist.get_node_rank() global_device = dist.get_global_rank() diff --git a/composer/loggers/__init__.py b/composer/loggers/__init__.py index 1c986593e9..0a931690ff 100644 --- a/composer/loggers/__init__.py +++ b/composer/loggers/__init__.py @@ -4,7 +4,7 @@ """Loggers to store metrics and artifacts. In Composer, algorithms and callbacks can make calls to the :class:`~.logger.Logger`, -which then routes the calls to the appropriate :class:`~.logger_destination.LoggerDestination`\\s. +which then routes the calls to the appropriate :class:`~.logger_destination.LoggerDestination` instances. The :class:`~.logger_destination.LoggerDestination` does the actual logging, for example to a file, or Weights and Biases. diff --git a/composer/loggers/file_logger.py b/composer/loggers/file_logger.py index 0ca63b1b5a..f90254997a 100644 --- a/composer/loggers/file_logger.py +++ b/composer/loggers/file_logger.py @@ -19,7 +19,7 @@ __all__ = ["FileLogger"] -class FileLogger(LoggerDestination): +class FileLogger(LoggerDestination): # noqa: D101 __doc__ = f"""Log data to a file. Example usage: diff --git a/composer/loggers/in_memory_logger.py b/composer/loggers/in_memory_logger.py index a743ddd162..f59fd0c2e4 100644 --- a/composer/loggers/in_memory_logger.py +++ b/composer/loggers/in_memory_logger.py @@ -23,8 +23,9 @@ class InMemoryLogger(LoggerDestination): - """Logs metrics to dictionary objects that persist in memory throughout training. Useful for collecting and plotting - data inside notebooks. + """Logs metrics to dictionary objects that persist in memory throughout training. + + Useful for collecting and plotting data inside notebooks. Example usage: .. testcode:: @@ -46,10 +47,6 @@ class InMemoryLogger(LoggerDestination): # which index in trainer.logger.destinations contains your desired logger. logged_data = trainer.logger.destinations[0].data - .. testcleanup:: - - trainer.engine.close() - Args: log_level (str | LogLevel, optional): :class:`~.logger.LogLevel` (i.e. unit of resolution) at @@ -120,7 +117,6 @@ def get_timeseries(self, metric: str) -> Dict[str, Any]: plt.xlabel("Batch") plt.ylabel("Validation Accuracy") """ - # Check that desired metric is in present data if metric not in self.data.keys(): raise ValueError(f"Invalid value for argument `metric`: {metric}. 
Requested " diff --git a/composer/loggers/logger.py b/composer/loggers/logger.py index 73e52501f3..0edabb46eb 100644 --- a/composer/loggers/logger.py +++ b/composer/loggers/logger.py @@ -31,6 +31,7 @@ class LogLevel(IntEnum): Logging destinations use the LogLevel to determine whether to record a given metric or state change. + Attributes: FIT: Logged once per training run. EPOCH: Logged once per epoch. @@ -152,7 +153,6 @@ def file_artifact( overwrite (bool, optional): Whether to overwrite an existing artifact with the same ``artifact_name``. (default: ``False``) """ - log_level = LogLevel(log_level) file_path = format_name_with_dist(format_str=str(file_path), run_name=self.run_name) file_path = pathlib.Path(file_path) @@ -185,7 +185,6 @@ def symlink_artifact( overwrite (bool, optional): Whether to overwrite an existing artifact with the same ``symlink_artifact_name``. (default: ``False``) """ - log_level = LogLevel(log_level) for destination in self.destinations: destination.log_symlink_artifact( @@ -197,15 +196,15 @@ def symlink_artifact( ) def data_fit(self, data: Dict[str, Any]) -> None: - """Helper function for ``self.data(LogLevel.FIT, data)``""" + """Helper function for ``self.data(LogLevel.FIT, data)``.""" self.data(LogLevel.FIT, data) def data_epoch(self, data: Dict[str, Any]) -> None: - """Helper function for ``self.data(LogLevel.EPOCH, data)``""" + """Helper function for ``self.data(LogLevel.EPOCH, data)``.""" self.data(LogLevel.EPOCH, data) def data_batch(self, data: Dict[str, Any]) -> None: - """Helper function for ``self.data(LogLevel.BATCH, data)``""" + """Helper function for ``self.data(LogLevel.BATCH, data)``.""" self.data(LogLevel.BATCH, data) @@ -214,6 +213,7 @@ def format_log_data_value(data: Any) -> str: Args: data: Data to format. + Returns: str: ``data`` as a string. """ diff --git a/composer/loggers/logger_destination.py b/composer/loggers/logger_destination.py index aa4ac732e9..3527088891 100644 --- a/composer/loggers/logger_destination.py +++ b/composer/loggers/logger_destination.py @@ -23,9 +23,7 @@ class LoggerDestination(Callback, ABC): :class:`~composer.core.event.Event`. For example, it may be helpful to run on :attr:`~composer.core.event.Event.EPOCH_END` to perform any flushing at the end of every epoch. - Example - ------- - + Example: .. doctest:: >>> from composer.loggers import LoggerDestination @@ -38,10 +36,6 @@ class LoggerDestination(Callback, ABC): ... loggers=[logger] ... ) Batch 0: {'rank_zero_seed': ...} - - .. testcleanup:: - - trainer.engine.close() """ def log_data(self, state: State, log_level: LogLevel, data: Dict[str, Any]): @@ -115,12 +109,15 @@ def log_symlink_artifact( symlink_artifact_name: str, overwrite: bool, ): - """Handle creating a symlink of a file artifact stored at ``existing_artifact_name`` to an artifact named + """Create a symlink. + + of a file artifact stored at ``existing_artifact_name`` to an artifact named ``symlink_artifact_name``. - Subclasses should implement this method to symlink logged files. However, not all loggers need to implement this - method. For example, the :class:`~composer.loggers.tqdm_logger.TQDMLogger` does not implement this method, as it - cannot handle file artifacts and thus does not need to do any special symlinking. + Subclasses should implement this method to create a symlink of a file artifact stored at + ``existing_artifact_name`` to an artifact named ``symlink_artifact_name``.. However, not all loggers need to + implement this method. 
For example, the :class:`~composer.loggers.tqdm_logger.TQDMLogger` does not implement + this method, as it cannot handle file artifacts and thus does not need to do any special symlinking. .. note:: diff --git a/composer/loggers/logger_hparams_registry.py b/composer/loggers/logger_hparams_registry.py index 9d60548bc0..ab5eb84e90 100644 --- a/composer/loggers/logger_hparams_registry.py +++ b/composer/loggers/logger_hparams_registry.py @@ -31,8 +31,7 @@ @dataclass class ObjectStoreLoggerHparams(hp.Hparams): - """:class:`~composer.loggers.in_memory_logger.InMemoryLogger` - hyperparameters. + """Hyperparameters for the :class:`~.ObjectStoreLogger`. Args: object_store_hparams (LibcloudObjectStoreHparams): The object store provider hparams. diff --git a/composer/loggers/object_store_logger.py b/composer/loggers/object_store_logger.py index 9d813a02cf..474e2ee842 100644 --- a/composer/loggers/object_store_logger.py +++ b/composer/loggers/object_store_logger.py @@ -43,7 +43,7 @@ def _always_log(state: State, log_level: LogLevel, artifact_name: str): class ObjectStoreLogger(LoggerDestination): - """Logger destination that uploads artifacts to an object store. + r"""Logger destination that uploads artifacts to an object store. This logger destination handles calls to :meth:`~composer.loggers.logger.Logger.file_artifact` and uploads files to an object store, such as AWS S3 or Google Cloud Storage. @@ -66,10 +66,6 @@ class ObjectStoreLogger(LoggerDestination): loggers=[object_store_logger], ) - .. testcleanup:: composer.loggers.object_store_logger.ObjectStoreLogger.__init__ - - trainer.engine.close() - .. note:: This callback blocks the training loop to copy each artifact where ``should_log_artifact`` returns ``True``, as @@ -83,7 +79,7 @@ class ObjectStoreLogger(LoggerDestination): always occurs in the background. * Provide a RAM disk path for the ``upload_staging_folder`` parameter. Copying files to stage on RAM will be - faster than writing to disk. However, there must have sufficient excess RAM, or :exc:`MemoryError`\\s may + faster than writing to disk. However, there must have sufficient excess RAM, or :exc:`MemoryError`\s may be raised. Args: @@ -301,8 +297,8 @@ def log_file_artifact(self, state: State, log_level: LogLevel, artifact_name: st def log_symlink_artifact(self, state: State, log_level: LogLevel, existing_artifact_name: str, symlink_artifact_name: str, overwrite: bool): - """Object stores do not natively support symlinks, so we emulate symlinks by adding a .symlink file to the - object store, which is a text file containing the name of the object it is pointing to.""" + # Object stores do not natively support symlinks, so we emulate symlinks by adding a .symlink file to the + # object store, which is a text file containing the name of the object it is pointing to. # Only symlink if we're logging artifact to begin with if not self.should_log_artifact(state, log_level, existing_artifact_name): return @@ -392,8 +388,7 @@ def _upload_worker( container: str, provider_kwargs: Optional[Dict[str, Any]], ): - """A long-running function to handle uploading files to the object store specified by (``provider``, ``container``, - ``provider_kwargs``). + """A long-running function to handle uploading files to the object store. The worker will continuously poll ``file_queue`` for files to upload. Once ``is_finished`` is set, the worker will exit once ``file_queue`` is empty. 
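The reworked ``_upload_worker`` summary above describes a poll-until-finished loop: the worker keeps draining ``file_queue`` and only exits once ``is_finished`` is set and the queue is empty. A minimal, self-contained sketch of that pattern (hypothetical names, not Composer's actual implementation):

.. code-block:: python

    import queue
    import threading

    def upload_worker(file_queue: queue.Queue, is_finished: threading.Event) -> None:
        # Keep polling; exit only once no more work will arrive AND the queue is drained.
        while True:
            try:
                object_name = file_queue.get(timeout=0.3)
            except queue.Empty:
                if is_finished.is_set():
                    break
                continue
            print(f"uploading {object_name}")  # placeholder for the real object-store upload
            file_queue.task_done()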
diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index 06252669e9..36f961e91f 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Log to Weights and Biases (https://wandb.ai/)""" +"""Log to `Weights and Biases `_.""" from __future__ import annotations @@ -25,7 +25,7 @@ class WandBLogger(LoggerDestination): - """Log to Weights and Biases (https://wandb.ai/) + """Log to `Weights and Biases `_. Args: project (str, optional): WandB project name. diff --git a/composer/loss/loss.py b/composer/loss/loss.py index 328e7b69b7..fadc461c06 100644 --- a/composer/loss/loss.py +++ b/composer/loss/loss.py @@ -27,9 +27,7 @@ def binary_cross_entropy_with_logits( pos_weight: Optional[Tensor] = None, scale_by_batch_size: Optional[bool] = True, ) -> torch.Tensor: - r"""Replacement for - :class:`~torch.nn.functional.binary_cross_entropy_with_logits` that can handle class - indices or one-hot labels. + r"""Replacement for :class:`~F.binary_cross_entropy_with_logits` that handles class indices or one-hot labels. :class:`~torch.nn.functional.binary_cross_entropy_with_logits` with ``reduction = 'mean'` will typically result in very small loss values (on the order of 1e-3), which @@ -83,8 +81,12 @@ def soft_cross_entropy(input: Tensor, ignore_index: int = -100, reduce: Optional[bool] = None, reduction: str = 'mean'): - r"""Drop-in replacement for :class:`~torch.nn.functional.cross_entropy` that can - handle class indices or one-hot labels. + r"""Drop-in replacement for :class:`~.F.cross_entropy` that handles class indices or one-hot labels. + + .. note:: + + This function will be obsolete with `this update `_. + Args: input (torch.Tensor) : :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)` in case of 2D Loss, or :math:`(N, C, d_1, d_2, ..., d_K)` where :math:`K \geq 1` @@ -116,7 +118,6 @@ def soft_cross_entropy(input: Tensor, elements in the output, ``'sum'``: the output will be summed. Note: ``size_average`` and ``reduce`` are in the process of being deprecated, and in the meantime, specifying either of those two args will override ``reduction``. Default: ``'mean'`` - This function will be obsolete with `this update `_. """ target_type = infer_target_type(input, target) diff --git a/composer/loss/utils.py b/composer/loss/utils.py index 2d8980b1aa..349cd0496f 100644 --- a/composer/loss/utils.py +++ b/composer/loss/utils.py @@ -34,12 +34,13 @@ def ensure_targets_one_hot(input: torch.Tensor, targets: torch.Tensor, num_classes: Optional[float] = None) -> torch.Tensor: r"""Ensures that the targets are in a one-hot format rather than an index format. + Args: input (torch.Tensor): :math:`(N, C)` where `C = number of classes` or :math:`(N, C, H, W)` in case of 2D Loss, or :math:`(N, C, d_1, d_2, ..., d_K)` where :math:`K \geq 1` in the case of K-dimensional loss. `input` is expected to contain unnormalized scores (often referred to as logits). - target (torch.Tensor) : If containing class indices, shape :math:`(N)` where each value is + targets (torch.Tensor) : If containing class indices, shape :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss. If containing class probabilities, same shape as the input. 
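The ``soft_cross_entropy`` and ``ensure_targets_one_hot`` docstrings above both hinge on the difference between index targets and one-hot (or soft) targets. A short plain-PyTorch sketch of that relationship, independent of Composer's helpers:

.. code-block:: python

    import torch
    import torch.nn.functional as F

    logits = torch.randn(4, 10)                 # (N, C) unnormalized scores
    index_targets = torch.randint(0, 10, (4,))  # class indices in [0, C-1]

    # Index targets converted to one-hot targets with the same shape as the logits.
    one_hot_targets = F.one_hot(index_targets, num_classes=10).float()

    # Cross-entropy over one-hot targets matches the index-based form.
    soft_ce = -(one_hot_targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()
    index_ce = F.cross_entropy(logits, index_targets)
    assert torch.allclose(soft_ce, index_ce)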
diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 236cd26c42..9ff1192bc5 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -84,7 +84,6 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: either the Tensor or a Mapping type that contains the loss or model logits. target (~torch.Tensor): A Tensor of ground-truth values to compare against. """ - assert isinstance(output, Tensor) output = output.view(-1, self.vocab_size) target = target.view(-1) @@ -178,7 +177,6 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: either the Tensor or a Mapping type that contains the loss or model logits. target (~torch.Tensor): A Tensor of ground-truth values to compare against. """ - # if logit modification algorithms aren't on, we take the loss directly from the model output if isinstance(output, Mapping) and 'loss' in output: loss = output['loss'] diff --git a/composer/models/efficientnetb0/_layers.py b/composer/models/efficientnetb0/_layers.py index 15d1e23643..ab12aec9c3 100644 --- a/composer/models/efficientnetb0/_layers.py +++ b/composer/models/efficientnetb0/_layers.py @@ -13,8 +13,9 @@ def round_channels( divisor: int = 8, min_value: Optional[int] = None, ) -> int: - """Round number of channels after scaling with width multiplier. This function ensures that channel integers halfway - inbetween divisors is rounded up. + """Round number of channels after scaling with width multiplier. + + This function ensures that channel integers halfway in-between divisors is rounded up. Args: channels (float): Number to round. @@ -23,7 +24,6 @@ def round_channels( min_value (int, optional): Minimum value the output can be. If not specified, defaults to the ``divisor``. """ - if not width_multiplier: return int(channels) channels *= width_multiplier @@ -37,7 +37,6 @@ def round_channels( def calculate_same_padding(kernel_size, dilation, stride): """Calculates the amount of padding to use to get the "SAME" functionality in Tensorflow.""" - return ((stride - 1) + dilation * (kernel_size - 1)) // 2 @@ -49,7 +48,6 @@ def drop_connect(inputs: torch.Tensor, drop_connect_rate: float, training: bool) drop_connect_rate (float): Probability of droppping each sample. training (bool): Whether or not the model is training """ - if not training: return inputs @@ -175,24 +173,26 @@ def forward(self, input: torch.Tensor): class MBConvBlock(nn.Module): - """Mobile Inverted Residual Bottleneck Block as defined in + """Mobile Inverted Residual Bottleneck Block. + + This block is implemented as as defined in `MobileNetV2: Inverted Residuals and Linear Bottlenecks `_ (Sandler et al, 2018). - Args: - in_channels (int): Number of channels in the input tensor. - out_channels (int): Number of channels in the output tensor. - kernel_size (int): Size of the convolving kernel. - stride (int): Stride of the convolution. - expand_ratio (int): How much to expand the input channels for the - depthwise convolution. - se_ratio (float): How much to scale `in_channels` for the hidden layer - dimensionality of the squeeze-excite module. - drop_connect_rate (float): Probability of dropping a sample before the - identity connection, provides regularization similar to stochastic - depth. - act_layer (torch.nn.Module): Activation layer to use in block. - norm_kwargs (dict): Normalization layer's keyword arguments. - norm_layer (torch.nn.Module): Normalization layer to use in block. + Args: + in_channels (int): Number of channels in the input tensor. 
+ out_channels (int): Number of channels in the output tensor. + kernel_size (int): Size of the convolving kernel. + stride (int): Stride of the convolution. + expand_ratio (int): How much to expand the input channels for the + depthwise convolution. + se_ratio (float): How much to scale `in_channels` for the hidden layer + dimensionality of the squeeze-excite module. + drop_connect_rate (float): Probability of dropping a sample before the + identity connection, provides regularization similar to stochastic + depth. + act_layer (torch.nn.Module): Activation layer to use in block. + norm_kwargs (dict): Normalization layer's keyword arguments. + norm_layer (torch.nn.Module): Normalization layer to use in block. """ def __init__(self, diff --git a/composer/models/initializers.py b/composer/models/initializers.py index a2f5c5a0f8..ca7b11d70f 100644 --- a/composer/models/initializers.py +++ b/composer/models/initializers.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Module Initializers.""" + from typing import Callable import torch @@ -19,6 +21,11 @@ class Initializer(StringEnum): LINEAR_LOG_CONSTANT_BIAS = "linear_log_constant_bias" def get_initializer(self) -> Callable[[torch.nn.Module], None]: + """Get the initializer function. + + Returns: + (torch.nn.Module) -> None: The initializer function. + """ def kaiming_normal(w: nn.Module): if isinstance(w, torch.nn.Linear) or isinstance(w, torch.nn.Conv2d): diff --git a/composer/models/model_hparams.py b/composer/models/model_hparams.py index 3c48e026fe..e878e6b375 100644 --- a/composer/models/model_hparams.py +++ b/composer/models/model_hparams.py @@ -35,8 +35,8 @@ class ModelHparams(hp.Hparams, ABC): @abstractmethod def initialize_object(self) -> ComposerModel: - """Invoked by the :meth:`~composer.trainer.trainer_hparams.TrainerHparams.initialize_object` to construct a - :class:`.ComposerModel`. + """ + Construct a :class:`.ComposerModel`. Returns: ComposerModel: The constructed :class:`.ComposerModel` diff --git a/composer/models/transformer_hparams.py b/composer/models/transformer_hparams.py index 4781e11e61..8d719411af 100644 --- a/composer/models/transformer_hparams.py +++ b/composer/models/transformer_hparams.py @@ -1,8 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""General `YAHP `_ interface for -ComposerTransformers.""" +"""YAHP :class:`.hp.Hparams` hyperparameters for ComposerTransformers.""" from abc import ABC from dataclasses import dataclass diff --git a/composer/models/transformer_shared.py b/composer/models/transformer_shared.py index 872143690b..3816f30c77 100644 --- a/composer/models/transformer_shared.py +++ b/composer/models/transformer_shared.py @@ -82,7 +82,7 @@ def loss(self, outputs: Mapping, batch: Batch) -> Union[Tensor, Sequence[Tensor] raise NotImplementedError("A model-specific loss function must be written.") def forward(self, batch: Batch) -> Mapping: - """Runs the forward pass of the model. + """Run the forward pass of the model. 
Args: batch (~composer.core.types.Batch): A dictionary of Dict[str, Tensor] of inputs that the diff --git a/composer/optim/decoupled_weight_decay.py b/composer/optim/decoupled_weight_decay.py index 505ad384fa..bb40101264 100644 --- a/composer/optim/decoupled_weight_decay.py +++ b/composer/optim/decoupled_weight_decay.py @@ -57,7 +57,6 @@ def sgdw(params: List[torch.Tensor], d_p_list: List[torch.Tensor], momentum_buff dampening (float): Dampening factor for momentum update nesterov (bool): Enables Nesterov momentum updates """ - for i, param in enumerate(params): d_p = d_p_list[i] @@ -188,7 +187,6 @@ def adamw(params: List[torch.Tensor], grads: List[torch.Tensor], exp_avgs: List[ weight_decay (float): Factor for decoupled weight decay eps (float): Term added to the denominator to improve numerical stability. """ - for i, param in enumerate(params): grad = grads[i] exp_avg = exp_avgs[i] diff --git a/composer/optim/optimizer_hparams_registry.py b/composer/optim/optimizer_hparams_registry.py index 13c2849281..fcda1091d2 100644 --- a/composer/optim/optimizer_hparams_registry.py +++ b/composer/optim/optimizer_hparams_registry.py @@ -45,8 +45,8 @@ def initialize_object( """Initializes the optimizer. Args: - param_group (Iterable[torch.Tensor] | Iterable[Dict[str, torch.Tensor]]): - Parameters for this optimizer to optimize. + param_group (Iterable[torch.Tensor] | Iterable[Dict[str, torch.Tensor]]): Parameters for + this optimizer to optimize. """ if self.optimizer_cls is None: raise ValueError(f"{type(self).__name__}.optimizer_cls must be defined") diff --git a/composer/optim/scheduler.py b/composer/optim/scheduler.py index 6b8fcf7b49..fad4cbf88d 100644 --- a/composer/optim/scheduler.py +++ b/composer/optim/scheduler.py @@ -164,7 +164,6 @@ def compile_composer_scheduler(scheduler: ComposerScheduler, state: State, ssr: Returns: compiled_scheduler (PyTorchScheduler): The scheduler, in a form compatible with PyTorch scheduler interfaces. """ - optimizers = state.optimizers if len(optimizers) != 1: raise NotImplementedError("Providing functional schedulers is unsupported with multiple optimizers.") @@ -380,7 +379,6 @@ def _cosine_anneal(x: float, min_y: float = 0.0, max_y: float = 1.0) -> float: Curve is cos(x) on domain [0, pi], stretched to the domain [0, 1] and range [min_y, max_y]. Additionally, param x is clipped to the interval [0, 1] """ - x = min(max(x, 0.0), 1.0) return min_y + (max_y - min_y) * (1 + math.cos(x * math.pi)) / 2 diff --git a/composer/profiler/json_trace_handler.py b/composer/profiler/json_trace_handler.py index d237c8c50b..a0426606c5 100644 --- a/composer/profiler/json_trace_handler.py +++ b/composer/profiler/json_trace_handler.py @@ -28,9 +28,11 @@ __all__ = ["JSONTraceHandler"] -class JSONTraceHandler(TraceHandler): - __doc__ = f"""Records trace events in `JSON trace format `_. +class JSONTraceHandler(TraceHandler): # noqa: D101 + __doc__ = f"""Records trace events in Chrome JSON trace format. + + See `this document `_ + for more information. Traces are output to ``output_directory``. Traces can be visualized using the Chrome Trace Viewer. To view in a Google Chrome browser, navigate to ``chrome://tracing`` and load the JSON trace file. 
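As a quick sanity check on the ``_cosine_anneal`` helper touched in the ``composer/optim/scheduler.py`` hunk above, here is a small standalone sketch that reproduces the formula shown there and verifies its endpoints. It is only an illustration of the documented curve, not new library code:

.. code-block:: python

    import math

    def cosine_anneal(x: float, min_y: float = 0.0, max_y: float = 1.0) -> float:
        # Clip x to [0, 1], then trace cos on [0, pi], rescaled to [min_y, max_y].
        x = min(max(x, 0.0), 1.0)
        return min_y + (max_y - min_y) * (1 + math.cos(x * math.pi)) / 2

    assert cosine_anneal(0.0) == 1.0              # the curve starts at max_y
    assert cosine_anneal(1.0) == 0.0              # and ends at min_y
    assert abs(cosine_anneal(0.5) - 0.5) < 1e-12  # the midpoint lands halfway

In other words, the schedule decays smoothly from ``max_y`` at ``x = 0`` to ``min_y`` at ``x = 1``.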
diff --git a/composer/profiler/json_trace_merger.py b/composer/profiler/json_trace_merger.py index 58274001fd..09f7e0b7cc 100644 --- a/composer/profiler/json_trace_merger.py +++ b/composer/profiler/json_trace_merger.py @@ -75,7 +75,6 @@ def merge_traces(output_file: Union[str, pathlib.Path], *trace_files: Union[str, output_file (str | pathlib.Path): The file to write the merged trace to trace_files (str | pathlib.Path): Variable number of trace files to merge together """ - ranks_to_clock_sync = _get_rank_to_clock_syncs(trace_files) rank_to_backwards_thread = {} rank_to_seen_threads = {rank: set() for rank in ranks_to_clock_sync.keys()} diff --git a/composer/profiler/marker.py b/composer/profiler/marker.py index 6cb05ef90f..aa4c3ce30a 100644 --- a/composer/profiler/marker.py +++ b/composer/profiler/marker.py @@ -118,7 +118,7 @@ def _record_counter_event(self, wall_clock_time_ns: int, timestamp: Timestamp, def start(self) -> None: """Record the start of a duration event. - To record the duration of an event, invoke :meth:`.Marker.start` followed by :meth:`.Marker.finish`\\: + To record the duration of an event, invoke :meth:`.Marker.start` followed by :meth:`.Marker.finish`. .. testsetup:: diff --git a/composer/profiler/profiler.py b/composer/profiler/profiler.py index 51879bf2fc..9b88a404c7 100644 --- a/composer/profiler/profiler.py +++ b/composer/profiler/profiler.py @@ -171,8 +171,10 @@ def trace_handlers(self, trace_handlers: Union[TraceHandler, Sequence[TraceHandl self._trace_handlers[:] = ensure_tuple(trace_handlers) def record_chrome_json_trace_file(self, filepath: Union[str, pathlib.Path]): - """Record trace events in `Chrome JSON format `_ in the trace handlers. + """Record trace events in Chrome JSON format in the trace handlers. + + See `this document `_ + for more information about Chrome JSON format. .. note:: diff --git a/composer/profiler/profiler_action.py b/composer/profiler/profiler_action.py index 5f3047edc9..f86d39c4d6 100644 --- a/composer/profiler/profiler_action.py +++ b/composer/profiler/profiler_action.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Action states for the :class:`Profiler` that define whether or not events are being recorded to the trace file.""" + from composer.utils.string_enum import StringEnum __all__ = ["ProfilerAction"] diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py index c2cd0d0b87..59c265bae4 100644 --- a/composer/profiler/torch_profiler.py +++ b/composer/profiler/torch_profiler.py @@ -25,7 +25,7 @@ __all__ = ["TorchProfiler"] -class TorchProfiler(Callback): +class TorchProfiler(Callback): # noqa: D101 __doc__ = f"""Profile the execution using the :class:`PyTorch Profiler `. Profiling results are stored in TensorBoard format in the directory specified by ``folder``. diff --git a/composer/profiler/trace_handler.py b/composer/profiler/trace_handler.py index 93ae1438d1..4413fa9d3a 100644 --- a/composer/profiler/trace_handler.py +++ b/composer/profiler/trace_handler.py @@ -90,8 +90,10 @@ def process_counter_event( pass def process_chrome_json_trace_file(self, filepath: pathlib.Path) -> None: - """Invoked when there are events in `Chrome JSON format `_ to record. + """Invoked when there are events in Chrome JSON format to record. + + See `this document `_ + for more information. Args: filepath (pathlib.Path): The filepath to a Chrome JSON trace file. 
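To make the trace workflow in the profiler hunks above concrete, here is a minimal usage sketch of ``merge_traces``. The signature (``output_file`` plus a variable number of trace files) is taken from the ``json_trace_merger.py`` docstring above; the file paths are hypothetical:

.. code-block:: python

    import pathlib

    from composer.profiler.json_trace_merger import merge_traces

    # Hypothetical per-rank traces written during a two-process run.
    rank_traces = [
        pathlib.Path("profiler_traces/rank0.trace.json"),
        pathlib.Path("profiler_traces/rank1.trace.json"),
    ]

    # Merge them into a single Chrome JSON trace file.
    merge_traces(pathlib.Path("profiler_traces/merged.trace.json"), *rank_traces)

The merged file can then be loaded in ``chrome://tracing``, as described in the ``JSONTraceHandler`` docstring.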
diff --git a/composer/trainer/_deepspeed.py b/composer/trainer/_deepspeed.py index 1ab00e3d73..7a7cd742e8 100644 --- a/composer/trainer/_deepspeed.py +++ b/composer/trainer/_deepspeed.py @@ -148,7 +148,6 @@ def _parse_deepspeed_config( to the trainer. RuntimeError: If the batch size of the train dataloader in the provided state is not set. """ - new_config = copy.deepcopy(config) _add_batch_config(new_config, state) _ensure_no_optim_in_config(new_config) @@ -178,7 +177,6 @@ def _fix_batch_precision_for_deepspeed(batch: Batch, precision: Precision) -> Ba Returns: Batch: The batch with it's precision adjusted to the specified precision. """ - if precision != Precision.FP16: return batch diff --git a/composer/trainer/_scaler.py b/composer/trainer/_scaler.py index 02aec05883..f08e5ce488 100644 --- a/composer/trainer/_scaler.py +++ b/composer/trainer/_scaler.py @@ -62,7 +62,6 @@ def step(self, optimizer: Optimizer, *args, **kwargs): Always called before the optimizer step. Checks if the optimizer can handle AMP closures (currently only Composer's SAM optimizer) If so, it passes an AMP-modified closure to the optimizer. """ - closure = kwargs["closure"] def _amp_closure(**kwargs): diff --git a/composer/trainer/devices/device.py b/composer/trainer/devices/device.py index 2b297ca097..204a7a11f7 100644 --- a/composer/trainer/devices/device.py +++ b/composer/trainer/devices/device.py @@ -56,9 +56,10 @@ def tensor_to_device(self, tensor: torch.Tensor) -> torch.Tensor: pass def batch_to_device(self, batch: T_Batch) -> T_Batch: - """Invoked by the :class:`.Trainer` move all tensors items in a batch to device. Supports nested sequences and - mappings of tensors. Ignores non-tensor items. Preserves sequence and types when possible, otherwise converts - sequences to lists. Converts mappings to dictionaries. + """Invoked by the :class:`.Trainer` move all tensors items in a batch to device. + + Supports nested sequences and mappings of tensors. Ignores non-tensor items. Preserves sequence and mapping types + when possible; otherwise, sequences are converted to lists, and mappings are converted to dictionaries. Args: batch (Any): The batch to move to the device. diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 825e3a4f92..bd57dc818f 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -"""Train models!""" +"""Train models.""" from __future__ import annotations @@ -483,10 +483,6 @@ class Trainer: load_path=checkpoint_path, load_object_store=store, ) - - .. testcleanup:: - - trainer.engine.close() load_weights_only (bool, optional): Whether or not to only restore the weights from the checkpoint without restoring the associated state. Ignored if ``load_path`` is ``None``. (default: ``False``) load_strict_model_weights (bool, optional): Ensure that the set of weights in the checkpoint and model must exactly match. 
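The recursive semantics described in the ``Device.batch_to_device`` docstring earlier in this patch (move tensors, recurse into sequences and mappings, pass non-tensor items through) can be pictured with a short standalone sketch. This illustrates the documented behavior only and is not Composer's actual implementation:

.. code-block:: python

    from typing import Any, Callable

    import torch

    def move_batch(batch: Any, to_device: Callable[[torch.Tensor], torch.Tensor]) -> Any:
        """Recursively move every tensor in ``batch``; non-tensor items pass through."""
        if isinstance(batch, torch.Tensor):
            return to_device(batch)
        if isinstance(batch, dict):
            # Mappings are rebuilt as plain dicts in this sketch.
            return {k: move_batch(v, to_device) for k, v in batch.items()}
        if isinstance(batch, (list, tuple)):
            # Sequences are rebuilt as lists here; the real method preserves the
            # original sequence type when it can.
            return [move_batch(v, to_device) for v in batch]
        return batch

    batch = {"image": torch.zeros(2, 3), "ids": ["a", "b"]}
    moved = move_batch(batch, lambda t: t.to("cpu"))  # no-op device for the example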
@@ -920,24 +916,25 @@ def __init__( # Load Checkpoint self._rng_state = None # If autoresume is enabled, first check for existing checkpoints to load - latest_checkpoint_path = self._check_for_autoresume(autoresume=autoresume, - save_folder=save_folder, - save_overwrite=save_overwrite, - save_latest_filename=save_latest_filename, - run_name=run_name, - save_latest_artifact_name=save_latest_artifact_name, - loggers=loggers, - load_chunk_size=load_chunk_size, - load_progress_bar=load_progress_bar) - # Found latest checkpoint path, load that instead - if latest_checkpoint_path: - load_path = latest_checkpoint_path - # Disable object_store and load_weights_only since we're autoresuming locally - load_object_store = None - load_weights_only = False - log.info( - f"Found latest checkpoint at {latest_checkpoint_path}, loading instead of load_path {load_path} as autoresume enabled." - ) + if autoresume: + latest_checkpoint_path = self._determine_autoresume_load_path( + save_folder=save_folder, + save_overwrite=save_overwrite, + save_latest_filename=save_latest_filename, + run_name=run_name, + save_latest_artifact_name=save_latest_artifact_name, + loggers=loggers, + load_chunk_size=load_chunk_size, + load_progress_bar=load_progress_bar) + # Found latest checkpoint path, load that instead + if latest_checkpoint_path: + load_path = latest_checkpoint_path + # Disable object_store and load_weights_only since we're autoresuming locally + load_object_store = None + load_weights_only = False + log.info( + f"Found latest checkpoint at {latest_checkpoint_path}, loading instead of load_path {load_path} as autoresume enabled." + ) # Actually load the checkpoint from potentially updated arguments if load_path is not None: self._rng_state = load_checkpoint(state=self.state, @@ -997,50 +994,58 @@ def saved_checkpoints(self) -> List[Tuple[Timestamp, List[pathlib.Path]]]: return [] return self._checkpoint_saver.saved_checkpoints - def _check_for_autoresume(self, autoresume: bool, save_folder: Optional[str], save_overwrite: bool, - save_latest_filename: str, run_name: Optional[str], save_latest_artifact_name: str, - loggers: List[LoggerDestination], load_chunk_size: int, load_progress_bar: bool): - """If autoresume is enabled, check for latest checkpoint locally. If none is found, check loggers + def _determine_autoresume_load_path( + self, + save_folder: Optional[str], + save_overwrite: bool, + save_latest_filename: str, + run_name: Optional[str], + save_latest_artifact_name: str, + loggers: List[LoggerDestination], + load_chunk_size: int, + load_progress_bar: bool, + ): + """Determines the load path when using autoresume. + + First, check for latest checkpoint locally. If none is found, check loggers for checkpoint. If any checkpoint is found, load that instead of load_path. If none are found, use the user specified load_path. """ - if autoresume: - if save_folder is None: - raise ValueError("save_folder must be specified when autoresume is enabled.") - if save_overwrite: - raise ValueError( - "save_overwrite must be False when autoresume is enabled as autoresume always loads the latest existing checkpoint in save_folder." 
- ) - if save_latest_filename is None: - raise ValueError( - "save_latest_filename must be specified so autoresume knows where to load checkpoints from.") - if run_name is None: - raise ValueError("run_name must be specified so autoresume knows which run to load from.") - latest_filename_formatted = format_name_with_dist(save_latest_filename, run_name) - latest_checkpoint_path = os.path.join(save_folder, latest_filename_formatted) - # If latest checkpoint is not saved locally, try to fetch from loggers - if not os.path.exists(latest_checkpoint_path) and save_latest_artifact_name is not None: - # Make save folder in case it doesn't exist so latest checkpoint can be downloaded - os.makedirs(save_folder, exist_ok=True) - for logger in loggers: - try: - # Fetch from logger. If it succeeds, stop trying the rest of the loggers - logger.get_file_artifact(artifact_name=format_name_with_dist( - save_latest_artifact_name, run_name), - destination=latest_checkpoint_path, - chunk_size=load_chunk_size, - progress_bar=load_progress_bar) - break - except (NotImplementedError, GetFileNotFoundException): - # Ignore errors caused by no checkpoint saved with logger - pass - # Require all ranks to have local checkpoint if we wish to restore from it - latest_checkpoint_exists = self._device.tensor_to_device( - torch.tensor([os.path.exists(latest_checkpoint_path)], dtype=torch.uint8)) - dist.all_reduce(latest_checkpoint_exists, reduce_operation="MIN") - # If latest checkpoint is saved locally, change load_path to it - if int(latest_checkpoint_exists.item()) == 1: - return latest_checkpoint_path + if save_folder is None: + raise ValueError("save_folder must be specified when autoresume is enabled.") + if save_overwrite: + raise ValueError( + "save_overwrite must be False when autoresume is enabled as autoresume always loads the latest existing checkpoint in save_folder." + ) + if save_latest_filename is None: + raise ValueError( + "save_latest_filename must be specified so autoresume knows where to load checkpoints from.") + if run_name is None: + raise ValueError("run_name must be specified so autoresume knows which run to load from.") + latest_filename_formatted = format_name_with_dist(save_latest_filename, run_name) + latest_checkpoint_path = os.path.join(save_folder, latest_filename_formatted) + # If latest checkpoint is not saved locally, try to fetch from loggers + if not os.path.exists(latest_checkpoint_path) and save_latest_artifact_name is not None: + # Make save folder in case it doesn't exist so latest checkpoint can be downloaded + os.makedirs(save_folder, exist_ok=True) + for logger in loggers: + try: + # Fetch from logger. 
If it succeeds, stop trying the rest of the loggers + logger.get_file_artifact(artifact_name=format_name_with_dist(save_latest_artifact_name, run_name), + destination=latest_checkpoint_path, + chunk_size=load_chunk_size, + progress_bar=load_progress_bar) + break + except (NotImplementedError, GetFileNotFoundException): + # Ignore errors caused by no checkpoint saved with logger + pass + # Require all ranks to have local checkpoint if we wish to restore from it + latest_checkpoint_exists = self._device.tensor_to_device( + torch.tensor([os.path.exists(latest_checkpoint_path)], dtype=torch.uint8)) + dist.all_reduce(latest_checkpoint_exists, reduce_operation="MIN") + # If latest checkpoint is saved locally, change load_path to it + if int(latest_checkpoint_exists.item()) == 1: + return latest_checkpoint_path def fit( self, @@ -1173,8 +1178,7 @@ def fit( If ``reset_time`` is True, then :attr:`.State.max_duration` will be set to this parameter. optimizers (torch.optim.Optimizer | Sequence[torch.optim.Optimizer], optional): See :class:`.Trainer`. - schedulers (PyTorchScheduler | ComposerScheduler | Sequence[PyTorchScheduler | ComposerScheduler], optional): - See :class:`.Trainer`. + schedulers (PyTorchScheduler | ComposerScheduler | Sequence[PyTorchScheduler | ComposerScheduler], optional): See :class:`.Trainer`. scale_schedule_ratio (float, optional): See :class:`.Trainer`. step_schedulers_every_batch (bool, optional): See :class:`.Trainer`. eval_dataloader (Iterable | DataSpec | Evaluator | Sequence[Evaluator], optional): See :class:`.Trainer`. @@ -1552,7 +1556,6 @@ def _train_loop(self) -> None: def _run_evaluators(self, event: Event, log_level: LogLevel): """Runs evaluators periodically during training.""" - for evaluator in self.state.evaluators: assert evaluator.eval_interval is not None, "eval_interval should have been set on __init__() or fit()" assert evaluator.subset_num_batches is not None, "subset_num_batches should have been set on __init__() or fit()" @@ -1566,8 +1569,9 @@ def _run_evaluators(self, event: Event, log_level: LogLevel): ) def _train_batch(self, use_grad_scaling: bool): - """Compute loss by training on a full batch of data. Adaptively change microbatch size if enabled to maximize - GPU usage. + """Compute loss by training on a full batch of data. + + Adaptively change microbatch size if enabled to maximize GPU usage. Args: use_grad_scaling (bool): Enables gradient scaling @@ -1699,6 +1703,7 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int, tot Args: use_grad_scaling (bool): Whether to use gradient scaling. + current_batch_size (int): The current batch size. minibatch_num_samples (int): Number of samples in the minibatch. total_loss (torch.Tensor): Total loss aggregated across all microbatches. is_final_microbatch (bool): If current microbatch is the last one. @@ -1775,7 +1780,6 @@ def predict(self, dataloader: Union[DataLoader, DataSpec], subset_num_batches: i on this many batches. This parameter has no effect if it is greater than ``len(dataloader)``. If ``-1``, then the entire loader will be iterated over. 
(default: ``-1``) """ - if isinstance(dataloader, DataSpec): data_spec = dataloader else: diff --git a/composer/trainer/trainer_hparams.py b/composer/trainer/trainer_hparams.py index 68110a6fb0..8655e9a4a1 100644 --- a/composer/trainer/trainer_hparams.py +++ b/composer/trainer/trainer_hparams.py @@ -879,8 +879,7 @@ class ExperimentHparams(hp.Hparams): evals: List[EvalHparams] = hp.optional("Eval hparams", default_factory=list) def initialize_object(self) -> Tuple[Trainer, List[FitKwargs], List[EvalKwargs]]: - """Construct the :class:`.Trainer`, :meth:`~Trainer.fit` kwargs, and - :meth:`~Trainer.eval` kwargs. + """Construct the :class:`.Trainer`, :meth:`~Trainer.fit` kwargs, and :meth:`~Trainer.eval` kwargs. Returns: Tuple[Trainer, List[FitKwargs], List[EvalKwargs]]: A tuple of the diff --git a/composer/utils/batch_helpers.py b/composer/utils/batch_helpers.py index 9c1afd5d1c..c3e066c94c 100644 --- a/composer/utils/batch_helpers.py +++ b/composer/utils/batch_helpers.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Helpers to get items and set items in a batch.""" + from operator import attrgetter, itemgetter from typing import Any, Callable, Sequence, Union @@ -170,7 +172,7 @@ def _batch_set_multiple(batch: Any, key: Any, value: Any) -> Any: def _batch_set_tuple(batch: Any, key: Union[int, str], value: Any) -> Any: - """"Sets key value pairs in tuples and NamedTuples.""" + """Sets key value pairs in tuples and NamedTuples.""" if hasattr(batch, '_fields'): # NamedTuple if isinstance(key, str): batch = batch._replace(**{key: value}) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index 0e4e7d1342..9a4bcf01a5 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -293,11 +293,13 @@ def _restore_checkpoint( return state_dict['rng'] -def save_checkpoint(state: State, - logger: Logger, - filename: str = "ep{epoch}-ba{batch}-rank{rank}", - *, - weights_only: bool = False) -> List[pathlib.Path]: +def save_checkpoint( + state: State, + logger: Logger, + filename: str = "ep{epoch}-ba{batch}-rank{rank}", + *, + weights_only: bool = False, +) -> List[pathlib.Path]: # noqa: D103 state_dict = { 'state': state.state_dict(), 'rng': reproducibility.get_rng_state(), diff --git a/composer/utils/collect_env.py b/composer/utils/collect_env.py index 921651197b..41cfd4fdb3 100644 --- a/composer/utils/collect_env.py +++ b/composer/utils/collect_env.py @@ -111,7 +111,7 @@ def get_host_processor_name() -> str: def get_host_processor_cores() -> int: - """Determine the number of physical host processor cores.""" + """Determines the number of physical host processor cores.""" return psutil.cpu_count(logical=False) @@ -126,7 +126,7 @@ def get_accel_model_name() -> str: def get_local_world_size() -> int: - """Determine the number of accelerators per node.""" + """Determines the number of accelerators per node.""" return dist.get_local_world_size() if cuda_available() else 0 @@ -140,7 +140,7 @@ def get_cuda_device_count() -> int: def _exc_report(exc_type) -> None: - """Produces exception report (exception message + environment report) + """Produces exception report (exception message + environment report). Args: exc_type (Exception): Type of exception. 
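The ``_batch_set_tuple`` helper fixed in ``composer/utils/batch_helpers.py`` above relies on ``NamedTuple._replace`` for its named-tuple branch. A tiny standalone example of that pattern (the ``Sample`` type here is made up for illustration):

.. code-block:: python

    from typing import NamedTuple

    class Sample(NamedTuple):
        x: int
        y: int

    s = Sample(x=1, y=2)
    # NamedTuples are immutable, so "setting" a field returns a new instance.
    s = s._replace(**{"y": 5})
    assert s == Sample(x=1, y=5)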
@@ -230,7 +230,6 @@ def configure_excepthook() -> None: >>> sys.excepthook """ - global _EXCEPTHOOK_REGISTERED # Needs to be indempotent across multiple trainers, don't register if we've already registered if not _EXCEPTHOOK_REGISTERED: @@ -268,7 +267,6 @@ def get_torch_env() -> str: # Get Composer environment info def get_composer_env() -> str: """Query Composer pertinent system information.""" - mutable_dict = ComposerEnv( composer_version=get_composer_version(), host_processor_model_name=get_host_processor_name(), @@ -287,7 +285,6 @@ def print_env(file: Optional[TextIO] = None) -> None: """Generate system information report. Example: - .. code-block:: python from composer.utils.collect_env import print_env @@ -363,7 +360,6 @@ def print_env(file: Optional[TextIO] = None) -> None: Args: file (TextIO, optional): File handle, `sys.stdout` or `sys.stderr`. Defaults to `sys.stdout`. """ - # Set stdout during runtime if no output file is specified if file is None: file = sys.stdout diff --git a/composer/utils/dist.py b/composer/utils/dist.py index a128061e59..4088658e15 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -130,7 +130,9 @@ def get_local_rank() -> int: def get_node_rank() -> int: - """Returns the node rank. For example, if there are 2 nodes, and 2 ranks per node, then global ranks 0-1 will have a + """Returns the node rank. + + For example, if there are 2 nodes, and 2 ranks per node, then global ranks 0-1 will have a node rank of 0, and global ranks 2-3 will have a node rank of 1. Returns: @@ -239,6 +241,7 @@ def broadcast_object_list(object_list: List[Any], src: int = 0) -> None: Each object must be picklable. Only objects on the ``src`` rank will be broadcast, but each rank must provide lists of equal sizes. src (int, optional): Source rank (default: ``0``) + Returns: None: ``object_list`` will be modified in-place and set to values of ``object_list`` from the ``src`` rank. """ @@ -257,8 +260,7 @@ def broadcast_object_list(object_list: List[Any], src: int = 0) -> None: def all_gather(tensor: torch.Tensor) -> Sequence[torch.Tensor]: - """Collects a :class:`~torch.Tensor` from each rank and return a sequence of - :class:`~torch.Tensor`\\s indexed by rank. + """Collects a :class:`~torch.Tensor` from each rank. .. seealso:: :func:`torch.distributed.all_gather` @@ -350,7 +352,7 @@ def initialize_dist(backend: str, timeout: datetime.timedelta): Args: backend (str): The distributed backend to use. Should be ``gloo`` for CPU training, or ``nccl`` for GPU training. - timeout (datetime.timedelta): The timeout for operations exected against the process group. + timeout (datetime.timedelta): The timeout for operations executed against the process group. """ if get_world_size() > 1 and not dist.is_available(): raise RuntimeError("When the world size is > 1, ``torch.distributed`` must be used. However, it is " @@ -392,13 +394,15 @@ def initialize_dist(backend: str, timeout: datetime.timedelta): def get_sampler(dataset: torch.utils.data.Dataset, *, drop_last: bool, shuffle: bool): - """Constructs a :class:`~torch.utils.data.distributed.DistributedSampler` for a dataset. The - :class:`~torch.utils.data.distributed.DistributedSampler` assumes that each rank has a complete copy of the dataset. - It ensures that each rank sees a unique shard for each epoch containing ``len(datset) / get_world_size()`` samples. + """Constructs a :class:`~torch.utils.data.distributed.DistributedSampler` for a dataset. 
+ + The :class:`~torch.utils.data.distributed.DistributedSampler` assumes that each rank has a complete copy of the + dataset. It ensures that each rank sees a unique shard for each epoch containing + ``len(dataset) / get_world_size()`` samples. .. note:: - If the ``dataset`` is already shareded by rank, use a :class:`~torch.utils.data.SequentialSampler` + If the ``dataset`` is already sharded by rank, use a :class:`~torch.utils.data.SequentialSampler` or :class:`~torch.utils.data.RandomSampler`. Args: diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py index cf3dfce9f7..774321849f 100644 --- a/composer/utils/file_helpers.py +++ b/composer/utils/file_helpers.py @@ -69,9 +69,11 @@ def ensure_folder_is_empty(folder_name: Union[str, pathlib.Path]): def ensure_folder_has_no_conflicting_files(folder_name: Union[str, pathlib.Path], filename: str, timestamp: Timestamp): - """Ensure that the given folder does not have any files conflicting with the ``filename`` format string. If any - filename is formatted with a timestamp where the epoch, batch, sample, or token counts are after ``timestamp``, a - ``FileExistsError`` will be raised. ``filename`` and occurs later than ``timestamp``, raise a ``FileExistsError``. + """Ensure that the given folder does not have any files conflicting with the ``filename`` format string. + + If any filename is formatted with a timestamp where the epoch, batch, sample, or token counts are after + ``timestamp``, a ``FileExistsError`` will be raised. + If ``filename`` and occurs later than ``timestamp``, raise a ``FileExistsError``. Args: folder_name (str | pathlib.Path): The folder to inspect. @@ -154,7 +156,7 @@ def ensure_folder_has_no_conflicting_files(folder_name: Union[str, pathlib.Path] """ -def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: object): +def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: object): # noqa: D103 formatted_str = format_str.format( run_name=run_name, rank=dist.get_global_rank(), @@ -246,7 +248,12 @@ def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: """ -def format_name_with_dist_and_time(format_str: str, run_name: str, timestamp: Timestamp, **extra_format_kwargs: object): +def format_name_with_dist_and_time( + format_str: str, + run_name: str, + timestamp: Timestamp, + **extra_format_kwargs: object, +): # noqa: D103 formatted_str = format_str.format( run_name=run_name, rank=dist.get_global_rank(), diff --git a/composer/utils/import_helpers.py b/composer/utils/import_helpers.py index 01f5efdedd..2448fd112b 100644 --- a/composer/utils/import_helpers.py +++ b/composer/utils/import_helpers.py @@ -10,16 +10,16 @@ class MissingConditionalImportError(ImportError): + """Handles errors for external packages that might not be installed. + + Args: + extra_deps_group (str): the pip package group, found in setup.py. For example, nlp for `mosaicml[nlp]`. + conda_package (str, optional): The package(s) to install if using conda. + conda_channel (str, optional): The conda channel to install packages from. Set to ``None`` if the + package is not published on conda and must be installed via pip. + """ def __init__(self, extra_deps_group: str, conda_package: str, conda_channel: Optional[str] = 'conda-forge'): - """Handles errors for external packages that might not be installed. - - Args: - extra_deps_group (str): the pip package group, found in setup.py. For example, nlp for `mosaicml[nlp]`. 
- conda_package (str, optional): The package(s) to install if using conda. - conda_channel (str, optional): The conda channel to install packages from. Set to ``None`` if the - package is not published on conda and must be installed via pip. - """ if conda_channel: conda_command = f"conda install -c {conda_channel} {conda_package}" else: diff --git a/composer/utils/iter_helpers.py b/composer/utils/iter_helpers.py index c4f1aab79e..1414733c26 100644 --- a/composer/utils/iter_helpers.py +++ b/composer/utils/iter_helpers.py @@ -11,7 +11,7 @@ def map_collection(collection, map_fn): - """Apply ``map_fn`` on each element in ``collection``. + """Applies ``map_fn`` on each element in ``collection``. * If ``collection`` is a tuple or list of elements, ``map_fn`` is applied on each element, and a tuple or list, respectively, containing mapped values is returned. diff --git a/composer/utils/libcloud_object_store.py b/composer/utils/libcloud_object_store.py index c29ccbfd1a..b62c0276e5 100644 --- a/composer/utils/libcloud_object_store.py +++ b/composer/utils/libcloud_object_store.py @@ -142,7 +142,9 @@ def upload_object_via_stream(self, headers=headers) def _get_object(self, object_name: str): - """Get object from object store. Recursively follow any symlinks. If an object does not exist, automatically + """Get object from object store. + + Recursively follow any symlinks. If an object does not exist, automatically checks if it is a symlink by appending ``.symlink``. Args: diff --git a/composer/utils/libcloud_object_store_hparams.py b/composer/utils/libcloud_object_store_hparams.py index 7c012860e2..f55c15025a 100644 --- a/composer/utils/libcloud_object_store_hparams.py +++ b/composer/utils/libcloud_object_store_hparams.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Hyperparameters for the :class:`~.LibcloudObjectStore`.""" + import dataclasses import os from typing import Any, Dict, Optional @@ -12,7 +14,7 @@ @dataclasses.dataclass class LibcloudObjectStoreHparams(hp.Hparams): - """:class:`~composer.utils.libcloud_object_store.LibcloudObjectStore` hyperparameters. + """:class:`~.LibcloudObjectStore` hyperparameters. .. rubric:: Example @@ -145,7 +147,6 @@ def initialize_object(self): Returns: LibcloudObjectStore: The object_store. """ - return LibcloudObjectStore( provider=self.provider, container=self.container, diff --git a/composer/utils/module_surgery.py b/composer/utils/module_surgery.py index 99c106f1e1..fb9469f59b 100644 --- a/composer/utils/module_surgery.py +++ b/composer/utils/module_surgery.py @@ -18,8 +18,7 @@ module (torch.nn.Module): Source module module_index (int): The i-th instance of module class. - Returns: - Optional[torch.nn.Module]: The replacement module, or ``None`` to indicate no modification. + Returns: Optional[torch.nn.Module]: The replacement module, or ``None`` to indicate no modification. """ import collections import itertools @@ -238,8 +237,9 @@ def count_module_instances(module: torch.nn.Module, module_class: Union[Type[tor def _tensor_in(tensor: torch.Tensor, iterable: Iterable[torch.Tensor]): - """Returns whether `tensor is element` for any element in `iterable` This function is necessary because `tensor in - iterable` does not work reliably for `Tensor`s. + """Returns whether ``tensor is element`` for any element in ``iterable``. + + This function is necessary because ``tensor in iterable`` does not work reliably for :class:`.Tensor` objects. 
See https://discuss.pytorch.org/t/how-to-judge-a-tensor-is-in-a-list/15998/4 for further discussion. @@ -248,7 +248,8 @@ def _tensor_in(tensor: torch.Tensor, iterable: Iterable[torch.Tensor]): def _find_param_in_optimizer(param: torch.nn.parameter.Parameter, optimizer: Optimizer) -> int: - """Returns the index of the optimizer ``param_group`` containing ``param`` + """Returns the index of the optimizer ``param_group`` containing ``param``. + Optimizers store their parameters within an iterable of ``dict``s called :attr:`~torch.optim.Optimizer.param_groups`. By default, there is only one group in :attr:`~torch.optim.Optimizer.param_groups` @@ -278,19 +279,19 @@ def _find_param_in_optimizer(param: torch.nn.parameter.Parameter, optimizer: Opt def update_params_in_optimizer(old_params: Iterable[torch.nn.parameter.Parameter], new_params: Iterable[torch.nn.parameter.Parameter], optimizers: Union[Optimizer, Sequence[Optimizer]]) -> None: - """Remove ``old_params`` from the ``optimizers`` and insert ``new_params``. + r"""Remove ``old_params`` from the ``optimizers`` and insert ``new_params``. Newly added parameters will be added to the same :attr:`~torch.optim.Optimizer.param_group` as the removed parameters. A :class:`RuntimeError` will be raised if ``old_params`` is split across multiple parameter groups. This function differs from :meth:`replace_params_in_optimizer` in that ``len(old_params)`` need not equal - ``len(new_params)``. However, this function does not support replacing parameters accross multiple optimizer + ``len(new_params)``. However, this function does not support replacing parameters across multiple optimizer groups. .. warning:: Dynamically removing parameters from a :class:`~torch.optim.Optimizer` and adding parameters - to an existing :attr:`~torch.optim.Optimizer.param_group`\\s are not officially supported, so this + to an existing :attr:`~torch.optim.Optimizer.param_group`\s are not officially supported, so this function may fail when PyTorch is updated. The `recommended practice `_ is to instead recreate the optimizer when the parameter set changes diff --git a/composer/utils/reproducibility.py b/composer/utils/reproducibility.py index 3e59377417..b52cb6675b 100644 --- a/composer/utils/reproducibility.py +++ b/composer/utils/reproducibility.py @@ -39,7 +39,6 @@ .. testcleanup:: - trainer.engine.close() warnings.resetwarnings() Attributes: @@ -95,7 +94,6 @@ def configure_deterministic_mode(): .. testcleanup:: - trainer.engine.close() warnings.resetwarnings() However, to configure deterministic mode for operations before the trainer is initialized, manually invoke this @@ -147,10 +145,6 @@ def seed_all(seed: int): >>> trainer = Trainer(seed=42) - .. testcleanup:: - - trainer.engine.close() - However, to configure the random seed for operations before the trainer is initialized, manually invoke this function at the beginning of your training script. @@ -173,7 +167,6 @@ def get_rng_state() -> List[Dict[str, Any]]: Returns: List[Dict[str, Any]]: A list of RNG State Dicts, indexed by global rank. 
""" - rng_state = { "python": random.getstate(), "numpy": np.random.get_state(), diff --git a/docker/pytorch/generate_build_matrix.py b/docker/pytorch/generate_build_matrix.py index 80d96b98ee..67fc93044c 100644 --- a/docker/pytorch/generate_build_matrix.py +++ b/docker/pytorch/generate_build_matrix.py @@ -1,14 +1,13 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -""" -Helper script to generate the build_matrix.yaml +"""Helper script to generate the ``build_matrix.yaml`` and update ``README.md``. -Note: this script requires tabulate. Run `pip install tabulate` if not installed +Note: this script requires tabulate. Run ``pip install tabulate`` if not installed -To run: python generate_build_matrix.py +To run:: -Also update the `README.md` in the docker folder with the resulting table. + python generate_build_matrix.py """ import itertools @@ -20,7 +19,7 @@ import yaml -def get_pytorch_version(python_version: str): +def _get_pytorch_version(python_version: str): if python_version == "3.9": return "1.11.0" if python_version in "3.8": @@ -30,7 +29,7 @@ def get_pytorch_version(python_version: str): raise ValueError(f"Invalid python version: {python_version}") -def get_torchvision_version(pytorch_version: str): +def _get_torchvision_version(pytorch_version: str): if pytorch_version == "1.10.2": return "0.11.3" if pytorch_version == "1.11.0": @@ -40,7 +39,7 @@ def get_torchvision_version(pytorch_version: str): raise ValueError(f"Invalid pytorch_version: {pytorch_version}") -def get_base_image(cuda_version: str): +def _get_base_image(cuda_version: str): if cuda_version == "cpu": return "ubuntu:20.04" if cuda_version == "11.1.1": @@ -50,7 +49,7 @@ def get_base_image(cuda_version: str): raise ValueError(f"Invalid cuda_version: {cuda_version}") -def get_cuda_version(pytorch_version: str, use_cuda: bool): +def _get_cuda_version(pytorch_version: str, use_cuda: bool): if not use_cuda: return "cpu" if pytorch_version == "1.9.1": @@ -60,7 +59,7 @@ def get_cuda_version(pytorch_version: str, use_cuda: bool): raise ValueError(f"Invalid pytorch_version: {str}") -def get_cuda_version_tag(cuda_version: str): +def _get_cuda_version_tag(cuda_version: str): if cuda_version == "cpu": return "cpu" if cuda_version == "11.1.1": @@ -70,7 +69,7 @@ def get_cuda_version_tag(cuda_version: str): raise ValueError(f"Invalid cuda_version: {cuda_version}") -def get_tags(python_version: str, pytorch_version: str, cuda_version_tag: str, cuda_version: str, stage: str): +def _get_tags(python_version: str, pytorch_version: str, cuda_version_tag: str, cuda_version: str, stage: str): if stage == "pytorch_stage": base_image_name = "mosaicml/pytorch" elif stage == "vision_stage": @@ -88,7 +87,7 @@ def get_tags(python_version: str, pytorch_version: str, cuda_version_tag: str, c return tags -def main(): +def _main(): python_versions = ["3.7", "3.8", "3.9"] cuda_options = [True, False] stages = ["pytorch_stage", "vision_stage"] @@ -98,13 +97,13 @@ def main(): for product in itertools.product(python_versions, cuda_options, stages): python_version, use_cuda, stage = product - pytorch_version = get_pytorch_version(python_version) - cuda_version = get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) - cuda_version_tag = get_cuda_version_tag(cuda_version) + pytorch_version = _get_pytorch_version(python_version) + cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) + cuda_version_tag = _get_cuda_version_tag(cuda_version) entry = { "BASE_IMAGE": - 
get_base_image(cuda_version), + _get_base_image(cuda_version), "CUDA_VERSION": cuda_version, "CUDA_VERSION_TAG": @@ -118,9 +117,9 @@ def main(): "TARGET": stage, "TORCHVISION_VERSION": - get_torchvision_version(pytorch_version), + _get_torchvision_version(pytorch_version), "TAGS": - get_tags( + _get_tags( python_version=python_version, pytorch_version=pytorch_version, cuda_version_tag=cuda_version_tag, @@ -184,4 +183,4 @@ def main(): if __name__ == "__main__": - main() + _main() diff --git a/docker/pytorch/pillow_stub/setup.py b/docker/pytorch/pillow_stub/setup.py index 835d501d9d..502b19b117 100644 --- a/docker/pytorch/pillow_stub/setup.py +++ b/docker/pytorch/pillow_stub/setup.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Setup script for the stub pillow.""" + import os from setuptools import setup diff --git a/docs/source/conf.py b/docs/source/conf.py index fc8cfeea97..b3441c5509 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,19 +1,19 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html +"""Configuration file for the Sphinx documentation builder. + +This file only contains a selection of the most common options. For a full +list see the documentation: +https://www.sphinx-doc.org/en/master/usage/configuration.html -# -- Path setup -------------------------------------------------------------- +-- Path setup -------------------------------------------------------------- +If extensions (or modules to document with autodoc) are in another directory, +add these directories to sys.path here. If the directory is relative to the +documentation root, use os.path.abspath to make it absolute, like shown here. +""" import ast -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# import importlib import inspect import json @@ -92,7 +92,7 @@ def _get_commit_sha() -> str: - """Determine the commit sha. + """Determines the commit sha. Returns: str: The git commit sha, as a string. @@ -253,7 +253,7 @@ def skip_redundant_namedtuple_attributes( skip: bool, options: sphinx.ext.autodoc.Options, ): - # Hide the default, duplicate attributes for named tuples + """Hide the default, duplicate attributes for named tuples.""" del app, what, name, skip, options if '_tuplegetter' in obj.__class__.__name__: return True @@ -279,7 +279,6 @@ def determine_sphinx_path(item: Union[Type[object], Type[BaseException], types.M or could be a very unlikely edge condition where a public item in a private module is reimported only by sibling module(s), not any (grand)parents. """ - # Check to see if `item` is itself private if item.__name__.startswith("_"): public_name = item.__name__ @@ -320,7 +319,7 @@ def add_module_summary_tables( options: sphinx.ext.autodoc.Options, lines: List[str], ): - """This hook adds in summary tables for each module, documenting all functions, exceptions, classes, and attributes. + """Add summary tables for each module, documenting all functions, exceptions, classes, and attributes. 
It links reimported imports to their original source, as not to create a duplicate, indexed toctree entry. It automatically inserts itself at the end of each module docstring. @@ -430,9 +429,7 @@ def add_module_summary_tables( def rstjinja(app, docname, source): - """ - Render our pages as a jinja template for fancy templating goodness. - """ + """Render our pages as a jinja template for fancy templating goodness.""" # Make sure we're outputting HTML if app.builder.format != 'html': return @@ -442,6 +439,7 @@ def rstjinja(app, docname, source): def get_algorithms_metadata() -> Dict[str, Dict[str, str]]: + """Get the metadata for algorithms from the ``metadata.json`` files.""" EXCLUDE = ['no_op_model'] root = os.path.join(os.path.dirname(__file__), '..', '..', 'composer', 'algorithms') @@ -479,7 +477,7 @@ def get_algorithms_metadata() -> Dict[str, Dict[str, str]]: line_to_delete = _('Bases: %s') % u':py:class:`object`' -def add_line_no_object_base(self, text, *args, **kwargs): +def _add_line_no_object_base(self, text, *args, **kwargs): if text.strip() == line_to_delete: return @@ -489,8 +487,9 @@ def add_line_no_object_base(self, text, *args, **kwargs): add_directive_header = ClassDocumenter.add_directive_header -def add_directive_header_no_object_base(self, *args, **kwargs): - self.add_line = add_line_no_object_base.__get__(self) +def _add_directive_header_no_object_base(self, *args, **kwargs): + """Hide that all classes inherit from the base class ``object``.""" + self.add_line = _add_line_no_object_base.__get__(self) result = add_directive_header(self, *args, **kwargs) @@ -499,7 +498,7 @@ def add_directive_header_no_object_base(self, *args, **kwargs): return result -ClassDocumenter.add_directive_header = add_directive_header_no_object_base +ClassDocumenter.add_directive_header = _add_directive_header_no_object_base def _recursive_getattr(obj: Any, path: str): @@ -531,6 +530,7 @@ def _determine_lineno_of_attribute(module: types.ModuleType, attribute: str): def linkcode_resolve(domain: str, info: Dict[str, str]): + """Adds links to the GitHub source code in the API Reference.""" assert domain == "py", f"unsupported domain: {domain}" module_name = info['module'] @@ -605,6 +605,7 @@ def visit_reference(self, node: Element) -> None: def setup(app: sphinx.application.Sphinx): + """Setup hook.""" app.connect('autodoc-skip-member', skip_redundant_namedtuple_attributes) app.connect('autodoc-process-docstring', add_module_summary_tables) app.connect('source-read', rstjinja) diff --git a/docs/source/doctest_cleanup.py b/docs/source/doctest_cleanup.py index adc827a052..297ea89152 100644 --- a/docs/source/doctest_cleanup.py +++ b/docs/source/doctest_cleanup.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Cleanup script that is executed at the end of each doctest.""" + import os # variables are defined in doctest_fixtures.py diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index a412f9f7ff..49c54b8f6d 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -4,8 +4,7 @@ # disabling general type issues because of monkeypatching #yright: reportGeneralTypeIssues=none -""" -Fixtures available in doctests. +"""Fixtures available in doctests. The script is run before any doctests are executed, so all imports and variables are available in any doctest. 
@@ -134,6 +133,7 @@ def loss_fun(output, target, reduction="none"): + """Dummy loss function.""" return torch.ones_like(target) diff --git a/docs/source/getting_started/welcome_tour.rst b/docs/source/getting_started/welcome_tour.rst index a28c85fa9a..00f72c2e49 100644 --- a/docs/source/getting_started/welcome_tour.rst +++ b/docs/source/getting_started/welcome_tour.rst @@ -110,7 +110,7 @@ simple: class MixUp(Algorithm): def match(self, event: Event, state: State) -> bool: - """Determine whether the algorithm should run on a given event.""" + """Determines whether the algorithm should run on a given event.""" return event in [Event.AFTER_DATALOADER, Event.AFTER_LOSS] def apply(self, event: Event, state: State, logger: Logger) -> None: diff --git a/docs/source/tables/__init__.py b/docs/source/tables/__init__.py index cba5725129..b30b18379a 100644 --- a/docs/source/tables/__init__.py +++ b/docs/source/tables/__init__.py @@ -1,2 +1,4 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 + +"""Table helpers for composer docs.""" diff --git a/docs/source/tables/utils.py b/docs/source/tables/utils.py index a8eee76d03..8b3bfa37ad 100644 --- a/docs/source/tables/utils.py +++ b/docs/source/tables/utils.py @@ -10,6 +10,7 @@ def list_dirs(folder): """Lists all dirs for a given folder. + Args: folder (str): The folder to list dirs for. """ @@ -19,12 +20,12 @@ def list_dirs(folder): def assert_attributes_exist(name, module_dict, attributes): """Assert that module has the provided attributes. + Args: name (str): The class name. module_dict (dict): The dict form of the class. attributes (list): The attributes to check for. """ - for attribute in attributes: assert attribute in module_dict, \ f"{name} should define {attribute} in its __init__.py file." @@ -32,11 +33,14 @@ def assert_attributes_exist(name, module_dict, attributes): def get_metadata(names, attributes, module_basepath): """Returns a nested dict of metadata with names as keys. + Checks that all attributes exist in module given by module_basepath.name. + Args: names (str): The module names. attributes (list): The attributes to check for. module_basepath (str): The import path of the module. + Example:: >>> get_metadata( names=['blurpool', 'label_smoothing'], @@ -78,13 +82,16 @@ def get_metadata(names, attributes, module_basepath): def build_markdown_table(header, metadata, sorted_keys, row_format): """Builds a markdown table, formatting `row_format` with the `metadata`. + Entries in the table are ordered by `sorted_keys`. + Args: header (list): list of header strings metadata (dict): nested dict of metadata sorted_keys (list): order of rows in table row_format (list): list of length(header). Elements are either a string - or a single-argument callable that returns a string. + or a single-argument callable that returns a string. + Returns: table_md (list): table in markdown format """ @@ -109,6 +116,7 @@ def _print_row(row): def index_tag_in_lines(lines, tag): """Returns line number where tag is found. + Args: lines (list): List of lines to check. tag (str): Tag to find. @@ -121,10 +129,12 @@ def index_tag_in_lines(lines, tag): def update_table_in_file(table, source_file): """Updates the table content based on a source file. + Given a `source file`, updates the table. Searches the file for 'Table Start' and 'Table End' tags, and replaces the content between those tags. The original file is retained with the `.bkp` suffix. 
+ Args: table (list): list of strings source_file (path): path to source file diff --git a/docs/source/trainer/distributed_training.rst b/docs/source/trainer/distributed_training.rst index c39e7a53a1..107a719a24 100644 --- a/docs/source/trainer/distributed_training.rst +++ b/docs/source/trainer/distributed_training.rst @@ -48,7 +48,7 @@ For additional configurations of our launcher script, run ``composer --help``. .. argparse:: :module: composer.cli.launcher - :func: get_parser + :func: _get_parser :prog: composer :nodescription: diff --git a/docs/source/trainer/using_the_trainer.rst b/docs/source/trainer/using_the_trainer.rst index b66506a5bd..17f06f91f8 100644 --- a/docs/source/trainer/using_the_trainer.rst +++ b/docs/source/trainer/using_the_trainer.rst @@ -187,10 +187,6 @@ argument. # points of the training loop trainer.fit() -.. testcleanup:: - - trainer.engine.close() - We handle inserting algorithms into the training loop and in the right order. .. seealso:: diff --git a/examples/checkpoint_with_wandb.py b/examples/checkpoint_with_wandb.py index de3ca0dc40..9c864e75c9 100644 --- a/examples/checkpoint_with_wandb.py +++ b/examples/checkpoint_with_wandb.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Save and Load Checkpoints with `Weights and Biases `.""" + import shutil import torch.utils.data diff --git a/examples/custom_models.py b/examples/custom_models.py index 4eec03b4f6..3f64fc45a2 100644 --- a/examples/custom_models.py +++ b/examples/custom_models.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Example for training with an algorithm on a custom model.""" + import torch import torch.utils.data from torchvision import datasets, transforms @@ -13,6 +15,7 @@ # Your custom model class SimpleModel(composer.models.ComposerClassifier): + """Your custom model.""" def __init__(self, num_hidden: int, num_classes: int): module = torch.nn.Sequential( diff --git a/examples/profiler_demo.py b/examples/profiler_demo.py index 3c0909904a..229ddf090c 100644 --- a/examples/profiler_demo.py +++ b/examples/profiler_demo.py @@ -1,6 +1,11 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Profiling Example. + +For a walk-through of this example, please see the `profiling guide`_. 
+""" + # [imports-start] import torch from torch.utils.data import DataLoader diff --git a/examples/run_composer_trainer.py b/examples/run_composer_trainer.py index 7cf06b70cb..00a13b68f1 100644 --- a/examples/run_composer_trainer.py +++ b/examples/run_composer_trainer.py @@ -23,13 +23,13 @@ from composer.utils import dist -def warning_on_one_line(message: str, category: Type[Warning], filename: str, lineno: int, file=None, line=None): +def _warning_on_one_line(message: str, category: Type[Warning], filename: str, lineno: int, file=None, line=None): # From https://stackoverflow.com/questions/26430861/make-pythons-warnings-warn-not-mention-itself return f'{category.__name__}: {message} (source: {filename}:{lineno})\n' -def main() -> None: - warnings.formatwarning = warning_on_one_line +def _main() -> None: + warnings.formatwarning = _warning_on_one_line if len(sys.argv) == 1: sys.argv = [sys.argv[0], "--help"] @@ -67,4 +67,4 @@ def main() -> None: if __name__ == "__main__": - main() + _main() diff --git a/pyproject.toml b/pyproject.toml index cc72c0ab83..c4b039ba7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1013,3 +1013,8 @@ min-public-methods=2 # Exceptions that will emit a warning when being caught. Defaults to # "BaseException, Exception". overgeneral-exceptions="BaseException,Exception" + +[tool.pydocstyle] +convention="google" +add_ignore="D102,D105,D107,D401" +add_select="D400,D404" diff --git a/scripts/ffcv/create_ffcv_datasets.py b/scripts/ffcv/create_ffcv_datasets.py index be916774ff..4fb73561b4 100644 --- a/scripts/ffcv/create_ffcv_datasets.py +++ b/scripts/ffcv/create_ffcv_datasets.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Helper utilities to create FFCV datasets.""" + import logging import os import sys @@ -15,7 +17,7 @@ log = logging.getLogger(__name__) -def get_parser(): +def _get_parser(): parser = ArgumentParser(description="Utility for converting datasets to ffcv format.") parser.add_argument("--dataset", @@ -81,8 +83,8 @@ def get_parser(): return parser -def parse_args(): - parser = get_parser() +def _parse_args(): + parser = _get_parser() args = parser.parse_args() @@ -100,9 +102,9 @@ def parse_args(): return args -def main(): +def _main(): - args = parse_args() + args = _parse_args() ds = None remote_location = None @@ -131,4 +133,4 @@ def main(): if __name__ == '__main__': - sys.exit(main()) + sys.exit(_main()) diff --git a/scripts/mds/ade20k.py b/scripts/mds/ade20k.py index aee8ee3d6a..e963c14d81 100644 --- a/scripts/mds/ade20k.py +++ b/scripts/mds/ade20k.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""ADE20K streaming dataset conversion scripts.""" + import os import random from argparse import ArgumentParser, Namespace @@ -31,7 +33,6 @@ def get(in_root: str, split: str, shuffle: bool) -> List[Tuple[str, str, str]]: Returns: List of samples of (uid, image_filename, annotation_filename). 
""" - # Get uids image_glob_pattern = f'{in_root}/images/{split}/ADE_{split}_*.jpg' images = sorted(glob(image_glob_pattern)) diff --git a/scripts/mds/coco.py b/scripts/mds/coco.py index 78402125db..ac7f5652d6 100644 --- a/scripts/mds/coco.py +++ b/scripts/mds/coco.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Coco streaming dataset conversion scripts.""" + import os from argparse import ArgumentParser, Namespace from typing import Dict, Iterable diff --git a/scripts/mds/imagenet1k.py b/scripts/mds/imagenet1k.py index 08abea327a..51a0421d35 100644 --- a/scripts/mds/imagenet1k.py +++ b/scripts/mds/imagenet1k.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""ImageNet 1K Streaming Dataset Conversion Script.""" + import os import random from argparse import ArgumentParser, Namespace @@ -70,12 +72,11 @@ def each(pairs: List[Tuple[str, int]]) -> Iterable[Dict[str, Any]]: def main(args: Namespace) -> None: - """Main: create ImageNet1k streaming dataset. + """Create ImageNet1k streaming dataset. Args: args (Namespace): Commandline arguments. """ - fields = ['uid', 'x', 'y'] for (split, expected_num_samples, shuffle) in [ diff --git a/setup.py b/setup.py index 647eb70731..b00932bdef 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +"""Composer package setup.""" + import os import site import sys @@ -17,6 +19,7 @@ # From https://stackoverflow.com/questions/51292333/how-to-tell-from-setup-py-if-the-module-is-being-installed-in-editable-mode class develop(develop_orig): + """Override the ``develop`` class to error if attempting an editable install as root.""" def run(self): if _IS_ROOT and (not _IS_VIRTUALENV) and (not _IS_USER): @@ -34,6 +37,7 @@ def run(self): def package_files(prefix: str, directory: str, extension: str): + """Get all the files to package.""" # from https://stackoverflow.com/a/36693250 paths = [] for (path, _, filenames) in os.walk(os.path.join(prefix, directory)): diff --git a/tests/algorithms/test_colout.py b/tests/algorithms/test_colout.py index b1bb7c9f3b..3ac9e73136 100644 --- a/tests/algorithms/test_colout.py +++ b/tests/algorithms/test_colout.py @@ -257,7 +257,7 @@ def test_match_incorrect(self, event: Event, colout_algorithm: ColOut, minimal_s @pytest.mark.parametrize("batch", [True]) def test_apply_batch(self, fake_image_batch: torch.Tensor, colout_algorithm: ColOut, minimal_state: State, empty_logger: Logger): - """Apply the algorithm to a fake batch.""" + """Applies the algorithm to a fake batch.""" p_row = colout_algorithm.p_row p_col = colout_algorithm.p_col @@ -270,7 +270,7 @@ def test_apply_batch(self, fake_image_batch: torch.Tensor, colout_algorithm: Col @pytest.mark.parametrize("batch", [True]) def test_apply_batch_pair(self, fake_image_batch: torch.Tensor, colout_algorithm: ColOut, minimal_state: State, empty_logger: Logger): - """Apply batch ColOut to 2-tuple of images.""" + """Applies batch ColOut to 2-tuple of images.""" p_row = colout_algorithm.p_row p_col = colout_algorithm.p_col diff --git a/tests/algorithms/test_progressive_resizing.py b/tests/algorithms/test_progressive_resizing.py index 4eace53d3d..36a5c0dc82 100644 --- a/tests/algorithms/test_progressive_resizing.py +++ b/tests/algorithms/test_progressive_resizing.py @@ -105,26 +105,22 @@ def test_without_target(self, X, y): @pytest.mark.parametrize("Wx,Hx", [(31, 31), (32, 32), (32, 16)]) def 
test_resize_batch_shape(self, X: torch.Tensor, y: torch.Tensor, mode: str, scale_factor: float): """Test scaling works for different input shapes.""" - Xc, _ = resize_batch(X, y, scale_factor, mode, resize_targets=False) assert check_scaled_shape(X, Xc, scale_factor) def test_resize_outputs_shape(self, X: torch.Tensor, y: torch.Tensor, mode: str, scale_factor: float): """Test that resizing outputs works.""" - _, yc = resize_batch(X, y, scale_factor, mode, resize_targets=True) assert check_scaled_shape(y, yc, scale_factor) def test_resize_outputs_crop(self, X: torch.Tensor, scale_factor: float): """Test that resizing outputs in crop mode gives the right targets.""" - xc, yc = resize_batch(X, X, scale_factor, "crop", resize_targets=True) assert torch.equal(xc, yc) @pytest.mark.parametrize("Wx,Hx,Wy,Hy", [(32, 32, 16, 16)]) def test_resize_outputs_different_shape(self, X, y, scale_factor: float, mode: str): """Test that resizing works when X and y have different shapes.""" - _, yc = resize_batch(X, y, scale_factor, mode, resize_targets=True) assert check_scaled_shape(y, yc, scale_factor) diff --git a/tests/loggers/test_file_logger.py b/tests/loggers/test_file_logger.py index 2fa566fe3d..0175de930a 100644 --- a/tests/loggers/test_file_logger.py +++ b/tests/loggers/test_file_logger.py @@ -27,7 +27,7 @@ def log_file_artifact(self, state: State, log_level: LogLevel, artifact_name: st @pytest.mark.parametrize("log_level", [LogLevel.EPOCH, LogLevel.BATCH]) -@pytest.mark.timeout(10) +@pytest.mark.timeout(30) def test_file_logger(dummy_state: State, log_level: LogLevel, tmp_path: pathlib.Path): log_file_name = os.path.join(tmp_path, "output.log") log_destination = FileLogger( diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 9d3d8ad235..2ee22aa239 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -531,6 +531,7 @@ def test_checkpoint( second_trainer_hparams = copy.deepcopy(composer_trainer_hparams) first_trainer = _test_checkpoint_trainer(composer_trainer_hparams) + dist.barrier() # Ensure all ranks wrote the checkpoint file save_interval_time = Time.from_timestring(save_interval) if save_interval_time.unit == TimeUnit.EPOCH: expected_num_checkpoints = ((num_epochs - 1) // save_interval_time.value) + 1 @@ -575,6 +576,7 @@ def test_checkpoint( second_trainer_hparams.load_strict_model_weights = False _test_checkpoint_trainer(second_trainer_hparams) + dist.barrier() # Ensure all ranks wrote the checkpoint file second_trainer_final_checkpoint_filepath = os.path.join(checkpoint_b_folder, final_checkpoint) assert_checkpoints_equivalent(