Commit

Fix amp_bf16 train with staged_train_test (#2198)
Summary:
This PR fixes amp_bf16 training with staged_train_test on CPU. The `forward_contexts` need to be set correctly with `torch.cpu.amp.autocast(dtype=torch.bfloat16)`; otherwise, under staged_train_test, the model does not actually run its forward pass in bf16.
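
As an illustration only (not part of this commit), here is a minimal sketch of how a forward-stage autocast context takes effect in a staged train test. The `run_forward` helper and the shape of the `forward_contexts` list are assumptions made for this sketch; the actual torchbenchmark plumbing differs.

import contextlib
import torch

# Contexts registered for the forward stage; the fix registers the CPU bf16
# autocast context here instead of as a whole-test context.
forward_contexts = [lambda: torch.cpu.amp.autocast(dtype=torch.bfloat16)]

def run_forward(model, example_input, target, loss_fn):
    # Enter every registered forward-stage context before the forward pass,
    # so CPU matmuls/convs inside it run in bfloat16.
    with contextlib.ExitStack() as stack:
        for ctx in forward_contexts:
            stack.enter_context(ctx())
        pred = model(example_input)
        return loss_fn(pred, target)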

Pull Request resolved: #2198

Reviewed By: aaronenyeshi

Differential Revision: D55428830

Pulled By: xuzhao9

fbshipit-source-id: 67d83d4c2948c571ef9416881170eee569c9f152
yanbing-j authored and facebook-github-bot committed Mar 27, 2024
1 parent 697752c commit 7cf8cb9
Showing 2 changed files with 22 additions and 5 deletions.
20 changes: 17 additions & 3 deletions torchbenchmark/util/extra_args.py
@@ -32,7 +32,10 @@ def check_precision(model: 'torchbenchmark.util.model.BenchmarkModel', precision
         if model.test == 'train' and model.device == 'cuda':
             return hasattr(model, 'enable_amp') or is_staged_train_test(model)
     if precision == "amp_bf16":
-        return model.device == 'cpu'
+        if model.test == 'eval' and model.device == 'cpu':
+            return True
+        if model.test == 'train' and model.device == 'cpu':
+            return hasattr(model, 'enable_amp') or is_staged_train_test(model)
     assert precision == "fp32", f"Expected precision to be one of {AVAILABLE_PRECISIONS}, but get {precision}"
     return True
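
For illustration only (not part of this diff): the attributes the new branch inspects on a CPU model. The SimpleNamespace stand-ins below are hypothetical, not real BenchmarkModel instances, and the import path is assumed to be available.

from types import SimpleNamespace
from torchbenchmark.util.extra_args import check_precision

# Hypothetical stand-ins exposing only the attributes check_precision reads.
cpu_eval_model = SimpleNamespace(device='cpu', test='eval')
cpu_train_model = SimpleNamespace(device='cpu', test='train', enable_amp=lambda: None)

assert check_precision(cpu_eval_model, 'amp_bf16')    # eval on CPU: accepted
assert check_precision(cpu_train_model, 'amp_bf16')   # train on CPU with enable_amp(): accepted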

@@ -113,8 +116,19 @@ def apply_decoration_args(model: 'torchbenchmark.util.model.BenchmarkModel', dar
                 import torch
                 model.add_context(lambda: torch.cuda.amp.autocast(dtype=torch.float16), stage=TEST_STAGE.FORWARD)
     elif dargs.precision == "amp_bf16":
-        import torch
-        model.add_context(lambda: torch.cpu.amp.autocast(dtype=torch.bfloat16))
+        assert model.device == "cpu", "amp_bf16 is only supported on cpu device."
+        if model.test == "eval":
+            import torch
+            model.add_context(lambda: torch.cpu.amp.autocast(dtype=torch.bfloat16))
+        elif model.test == "train":
+            if is_staged_train_test(model):
+                import torch
+                model.add_context(lambda: torch.cpu.amp.autocast(dtype=torch.bfloat16), stage=TEST_STAGE.FORWARD)
+            else:
+                if hasattr(model, 'enable_amp'):
+                    model.enable_amp()
+                else:
+                    assert False, f"model has no enable_amp()"
     elif not dargs.precision == "fp32":
         assert False, f"Get an invalid precision option: {dargs.precision}. Please report a bug."

7 changes: 5 additions & 2 deletions torchbenchmark/util/framework/vision/model_factory.py
@@ -74,8 +74,11 @@ def forward(self):
         with torch.no_grad():
             self.example_outputs = (torch.rand_like(self.model(*self.example_inputs)), )
         for data, target in zip(self.example_inputs, self.example_outputs):
-            pred = self.model(data)
-            return self.loss_fn(pred, target)
+            # Alexnet returns non-grad tensors in forward pass
+            # Force to call requires_grad_(True) here
+            pred = self.model(data).requires_grad_(True)
+            u = self.loss_fn(pred, target)
+            return u

     def backward(self, loss):
         loss.backward()
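
A minimal illustration (not part of this diff) of the failure mode the requires_grad_(True) call works around: backward() on a loss built from a tensor that does not require grad raises a RuntimeError.

import torch

target = torch.rand(4)
pred = torch.rand(4)              # detached output, requires_grad=False

loss = torch.nn.functional.mse_loss(pred, target)
try:
    loss.backward()               # fails: nothing in the graph requires grad
except RuntimeError as err:
    print(err)

# Marking the prediction as requiring grad lets backward() run.
loss = torch.nn.functional.mse_loss(pred.requires_grad_(True), target)
loss.backward()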
