Lightning-AI · williamFalcon · Aug 14, 2020 · Aug 14, 2020 · Aug 14, 2020 · Aug 14, 2020
@@ -86,12 +86,18 @@ def spawn_ddp_children(self, model):
         # use the same python interpreter and actually running
         command = [sys.executable] + command
 
-        # since this script sets the visible devices we replace the gpus flag with a number
-        num_gpus = torch.cuda.device_count()
-
-        if '--gpus' in command:
-            gpu_flag_idx = command.index('--gpus')
-            command[gpu_flag_idx + 1] = f'{num_gpus}'
+        # CUDA_VISIBLE_DEVICES tells us how many GPUs we want to use.
+        # By the time the trainer script reaches this point, the devices have already been scoped,
+        # so when launching the child scripts we leave CUDA_VISIBLE_DEVICES alone
+        # and forward the selected GPUs via the PL_TRAINER_GPUS environment variable instead.
+        gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+        if len(gpu_ids) == 1:
+            gpu_ids = f'{gpu_ids},'
+
+        num_gpus = max(1, len(gpu_ids.split(',')))
+
+        # set the flag for ddp scripts
+        os.environ['PL_TRAINER_GPUS'] = gpu_ids
 
         os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'
 

@@ -498,6 +498,9 @@ def __init__(
         self.accumulate_grad_batches = accumulate_grad_batches
         self.configure_accumulated_gradients(accumulate_grad_batches)
 
+        # override with environment flag
+        gpus = os.environ.get('PL_TRAINER_GPUS', gpus)
+
         # for gpus allow int, string and gpu list
         if auto_select_gpus and isinstance(gpus, int):
             self.gpus = pick_multiple_gpus(gpus)