This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Gradient accumulation #3512

Closed
wants to merge 32 commits
Changes from 28 commits
Commits
32 commits
9573e55
Added default predictor for bimpm model
scarecrow1123 Nov 5, 2018
563647a
Merge pull request #1 from allenai/master
scarecrow1123 Dec 10, 2018
5ccf8ad
Merge pull request #2 from allenai/master
scarecrow1123 Jan 1, 2019
aaeddef
Merge pull request #3 from allenai/master
scarecrow1123 Apr 15, 2019
96db26f
Fix #2717: Add day count in training duration
scarecrow1123 Apr 15, 2019
64552a0
Modify elapsed time format to use `timedelta`
scarecrow1123 Apr 16, 2019
d0ac4ca
Merge branch 'master' of git://github.com/allenai/allennlp
scarecrow1123 Apr 17, 2019
383bf6d
Add gradient accumulation support
scarecrow1123 Apr 17, 2019
ad94b1f
Add doc for gradient accumulation
scarecrow1123 Apr 17, 2019
a33865b
Fix linter errors
scarecrow1123 Apr 17, 2019
f7a8ff7
Fix gradient accumulation to work for multi GPU
scarecrow1123 Apr 23, 2019
6f45aa3
Add test for gradient accumulation
scarecrow1123 Apr 23, 2019
955e2c4
Merge 'upstream/master' into gradient-accumulation
scarecrow1123 Apr 23, 2019
e2e48e3
Rename `num_batch_groups` and clarify usage
scarecrow1123 Apr 24, 2019
12a87ca
Add comments to clarify gradient accumulation
scarecrow1123 Apr 24, 2019
71398f1
Add more checks in gradient accumulation test
scarecrow1123 Apr 24, 2019
07cd9b2
Fix linter error
scarecrow1123 Apr 24, 2019
c66557f
Merge remote-tracking branch 'origin/master' into GradientAccumulation
dirkgr Dec 11, 2019
b1dbf6d
Remove unused parameter
dirkgr Dec 11, 2019
43fc57b
Comment changes
dirkgr Dec 11, 2019
2a45ecf
Typo
dirkgr Dec 11, 2019
b6c47ff
More concise comment
dirkgr Dec 11, 2019
6c697ea
Scale the loss appropriately to the number of gradient accumulation s…
dirkgr Dec 11, 2019
1cc0f87
Fixes the name of the config option
dirkgr Dec 11, 2019
c56006f
Productivity through formatting
dirkgr Dec 11, 2019
437b490
Fix training loss
dirkgr Dec 11, 2019
4659312
More formatting
dirkgr Dec 11, 2019
83087fa
Remove overeager warning
dirkgr Dec 12, 2019
a7fc41b
Be more lenient about variable batch sizes
dirkgr Dec 12, 2019
7da83de
Revert "Be more lenient about variable batch sizes"
dirkgr Dec 12, 2019
e7935c5
Fix for infinite iterator
dirkgr Dec 16, 2019
74ee2b3
Cut batches in half
dirkgr Dec 16, 2019
2 changes: 1 addition & 1 deletion allennlp/common/util.py
@@ -91,7 +91,7 @@ def sanitize(x: Any) -> Any:
def group_by_count(iterable: List[Any], count: int, default_value: Any) -> List[List[Any]]:
    """
    Takes a list and groups it into sublists of size ``count``, using ``default_value`` to pad the
-    list at the end if the list is not divisable by ``count``.
+    list at the end if the list is not divisible by ``count``.

    For example:
    >>> group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 0)
27 changes: 27 additions & 0 deletions allennlp/tests/training/trainer_test.py
@@ -5,6 +5,7 @@
import re
import time
from typing import Dict
+from pathlib import Path

import torch
import pytest
@@ -832,6 +833,32 @@ def test_restoring_works_with_older_checkpointing(self):
        assert trainer._metric_tracker._best_so_far == 0.1
        assert trainer._metric_tracker._epochs_with_no_improvement == 1

+    def test_trainer_can_run_gradient_accumulation(self):
+        with Path(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv").open() as input_file:
+            num_training_instances = sum(1 for i in input_file)
+
+        steps_to_accumulate = 2
+
+        trainer = Trainer(
+            self.model,
+            self.optimizer,
+            self.iterator,
+            self.instances,
+            validation_dataset=self.instances,
+            num_epochs=2,
+            num_gradient_accumulation_steps=steps_to_accumulate,
+        )
+        assert trainer._num_gradient_accumulation_steps == steps_to_accumulate
+
+        metrics = trainer.train()
+
+        num_batches_trained_per_epoch = trainer._batch_num_total // (metrics["training_epochs"] + 1)
+        num_batches_expected = (
+            num_training_instances // self.iterator._batch_size // steps_to_accumulate
+        )
+
+        assert num_batches_trained_per_epoch == num_batches_expected
+

class TestSparseClipGrad(AllenNlpTestCase):
    def test_sparse_clip_grad(self):
37 changes: 28 additions & 9 deletions allennlp/training/trainer.py
@@ -60,6 +60,7 @@ def __init__(
        should_log_learning_rate: bool = False,
        log_batch_size_period: Optional[int] = None,
        moving_average: Optional[MovingAverage] = None,
+        num_gradient_accumulation_steps: int = 1,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
@@ -171,6 +172,10 @@ def __init__(
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
+        num_gradient_accumulation_steps: ``int``, optional, (default = 1)
+            Gradients are accumulated for the given number of steps before doing an optimizer step. This can
+            be useful to accommodate effective batch sizes that are too large to fit in memory. Refer to
+            Thomas Wolf's [post](https://tinyurl.com/y5mv44fw) for details on gradient accumulation.
        """
        super().__init__(serialization_dir, cuda_device)

@@ -250,6 +255,8 @@ def __init__(

        self._last_log = 0.0  # time of last logging

+        self._num_gradient_accumulation_steps = num_gradient_accumulation_steps
+
        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self.model)
@@ -300,12 +307,18 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
        # Set the model to "train" mode.
        self.model.train()

-        num_gpus = len(self._cuda_devices)
+        # A `batch_group` has chunks of tensors that form a single batch together for an optimizer
+        # step. A single chunk always contains as many instances as configured in the iterator's
+        # `batch_size` param. The number of chunks in a single `batch_group` is

Contributor:
This isn't precisely accurate. Iterators create batches that aren't `batch_size` for many reasons. Maybe say something like, for the "simple case of a BasicIterator".

Member Author:
"typically contains"?

What are the other possibilities? Gradient accumulation becomes more complicated when the batches aren't all the same size, and this code doesn't handle that case properly. Neither does the multi-GPU code.

Contributor:
`maximum_samples_per_batch` being the relevant config option.

Contributor:
In the case of that LM we also scale our loss internally based on the number of tokens.

+        # `num_gradient_accumulation_steps` * `num_gpus`.
+        batch_group_length = self._num_gradient_accumulation_steps * len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
-        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
-        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus)
+        train_generator = lazy_groups_of(raw_train_generator, batch_group_length)
+        num_training_batches = math.ceil(
+            self.iterator.get_num_batches(self.train_data) / batch_group_length
+        )
        self._last_log = time.time()
        last_save_time = time.time()
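
To make the grouping arithmetic above concrete, here is a minimal runnable sketch (not part of the diff) using `lazy_groups_of` from `allennlp.common.util`; the integers stand in for the tensor dicts the iterator would yield:

from allennlp.common.util import lazy_groups_of

num_gradient_accumulation_steps = 2
num_gpus = 2  # len(self._cuda_devices) in the trainer
batch_group_length = num_gradient_accumulation_steps * num_gpus  # 4 batches per optimizer step

# Stand-in for `self.iterator(self.train_data, ...)`: ten "batches", here just ints.
raw_train_generator = iter(range(10))

for batch_group in lazy_groups_of(raw_train_generator, batch_group_length):
    # Each group feeds one optimizer step: 2 accumulation steps, each spread over 2 GPUs.
    print(len(batch_group), batch_group)
# Prints:
# 4 [0, 1, 2, 3]
# 4 [4, 5, 6, 7]
# 2 [8, 9]   <- the trailing group can be shorter, as the review comments note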

@@ -325,14 +338,18 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:

            self.optimizer.zero_grad()

-            loss = self.batch_loss(batch_group, for_training=True)
-
-            if torch.isnan(loss):
-                raise ValueError("nan loss encountered")
+            batches_for_step = list(lazy_groups_of(iter(batch_group), len(self._cuda_devices)))

Contributor:
The conversion to a list seems unnecessary.

Member Author:
It's needed because I need to know how many there are to scale the loss.

Contributor:
👍

+            for batch_for_step in batches_for_step:
+                loss = self.batch_loss(batch_for_step, for_training=True)
+                if torch.isnan(loss):
+                    raise ValueError("nan loss encountered")

-            loss.backward()
+                # `len(batches_for_step)` should always be `num_gradient_accumulation_steps`, except
+                # for the last batch in the epoch.
+                loss = loss / len(batches_for_step)

Contributor:
It would be good if the loss per sub-batch was scaled relative to its proportion of the overall accumulated batch - e.g., a batch of size 64 and a batch of size 12 would get weighted evenly here. You can do this with `training_util.get_batch_size(batch)`.

Contributor:
Are we sure we want this? Sometimes we're already scaling by sample size. For instance, https://github.com/allenai/allennlp/blob/master/allennlp/models/language_model.py#L322.

Contributor:
Oh! Sorry, my bad. Disregard.

Member Author:
I'm trying this right now.

Member Author:
Done in a7fc41b.

Contributor:
@dirkgr, I might not have been clear. We shouldn't do this. It breaks cases where users have scaled by sample size in their models.

Member Author:
After a long discussion, I reverted a7fc41b. Looks like we're going to break somebody, but this at least keeps the more common cases the same.
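
For reference, a rough sketch of the proportional weighting discussed above, as tried in a7fc41b and later reverted; this is a reconstruction, not the actual contents of that commit. It is written as if it ran inside the same loop as the diff (so `batch_group`, `self.batch_loss`, `self._cuda_devices`, and `train_loss` come from the trainer), and it weights each sub-batch by its share of the instances instead of dividing by the number of sub-batches:

from allennlp.training import util as training_util

batches_for_step = list(lazy_groups_of(iter(batch_group), len(self._cuda_devices)))
# Instances per sub-batch, summed over the per-GPU chunks in each sub-batch.
batch_sizes = [
    sum(training_util.get_batch_size(chunk) for chunk in batch_for_step)
    for batch_for_step in batches_for_step
]
total_size = sum(batch_sizes)

for batch_for_step, batch_size in zip(batches_for_step, batch_sizes):
    loss = self.batch_loss(batch_for_step, for_training=True)
    if torch.isnan(loss):
        raise ValueError("nan loss encountered")
    # e.g. 64 / 76 for a large sub-batch and 12 / 76 for a small one.
    loss = loss * (batch_size / total_size)
    loss.backward()
    train_loss += loss.item()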

+                loss.backward()

-            train_loss += loss.item()
+                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

@@ -697,6 +714,7 @@ def from_params(  # type: ignore
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)
+        num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
@@ -779,4 +797,5 @@ def from_params(  # type: ignore
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
+            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        )
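
Finally, a hedged usage sketch mirroring the new test; `model`, `optimizer`, `iterator`, and `train_dataset` are placeholders rather than anything defined in this PR. With the iterator's `batch_size` set to 8 and four accumulation steps, each optimizer step aggregates gradients from 32 instances; the same option can also be set in a configuration file via the `num_gradient_accumulation_steps` key that `from_params` pops above.

from allennlp.training.trainer import Trainer

trainer = Trainer(
    model,                              # an allennlp Model (placeholder)
    optimizer,                          # e.g. torch.optim.Adam(model.parameters())
    iterator,                           # e.g. BasicIterator(batch_size=8)
    train_dataset,                      # list of Instances (placeholder)
    num_epochs=10,
    num_gradient_accumulation_steps=4,  # 8 * 4 = 32 instances per optimizer step
)
metrics = trainer.train()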