diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 36258c1508f9..63876e053ad4 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1684,7 +1684,9 @@ def _setup_devices(self) -> "torch.device":
             )
             device = torch.device("mps")
             self._n_gpu = 1
-
+        elif self.no_cuda:
+            device = torch.device("cpu")
+            self._n_gpu = 0
         else:
             # if n_gpu is > 1 we'll use nn.DataParallel.
             # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`