From 2b8741c33eca7bfd8ea1555fe338b86d6a785230 Mon Sep 17 00:00:00 2001
From: eunwoosh
Date: Tue, 13 Dec 2022 16:21:41 +0900
Subject: [PATCH] add multi gpu test code

---
 otx/cli/tools/train.py                        | 43 ++++++++-----------
 otx/cli/utils/tests.py                        |  4 ++
 .../cli/classification/test_classification.py | 24 +++++++++++
 .../cli/detection/test_detection.py           |  8 ++++
 .../detection/test_instance_segmentation.py   |  8 ++++
 .../cli/segmentation/test_segmentation.py     |  8 ++++
 6 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/otx/cli/tools/train.py b/otx/cli/tools/train.py
index 67ff4c0a37c..11a87aed575 100644
--- a/otx/cli/tools/train.py
+++ b/otx/cli/tools/train.py
@@ -16,11 +16,11 @@
 
 import argparse
 import os
-import sys
 import signal
+import sys
 import threading
-from typing import List, Optional
 import time
+from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -221,10 +221,7 @@ def main():
 
     if args.gpus:
         multigpu_manager = MultiGPUManager(args.gpus, str(args.multi_gpu_port))
         if multigpu_manager.is_available(template):
-            multigpu_manager.setup_multi_gpu_train(
-                task.project_path,
-                hyper_parameters if args.enable_hpo else None
-            )
+            multigpu_manager.setup_multi_gpu_train(task.project_path, hyper_parameters if args.enable_hpo else None)
 
     output_model = ModelEntity(dataset, environment.get_model_configuration())
@@ -252,6 +249,7 @@ def main():
     if args.gpus:
         multigpu_manager.finalize()
 
+
 class MultiGPUManager:
     def __init__(self, gpu_ids: str, multi_gpu_port: str):
         self._gpu_ids = self._get_gpu_ids(gpu_ids)
@@ -262,7 +260,7 @@ def __init__(self, gpu_ids: str, multi_gpu_port: str):
     def _get_gpu_ids(self, gpus: str) -> List[int]:
         num_available_gpu = torch.cuda.device_count()
         gpu_ids = []
-        for gpu_id in gpus.split(','):
+        for gpu_id in gpus.split(","):
             if not gpu_id.isnumeric():
                 raise RuntimeError("--gpus argument should be numbers separated by ','.")
             gpu_ids.append(int(gpu_id))
@@ -284,9 +282,7 @@ def is_available(self, template) -> bool:
         return len(self._gpu_ids) > 1 and not template.task_type.is_anomaly
 
     def setup_multi_gpu_train(
-        self,
-        output_path: str,
-        optimized_hyper_parameters: Optional[ConfigurableParameters] = None
+        self, output_path: str, optimized_hyper_parameters: Optional[ConfigurableParameters] = None
     ):
         if optimized_hyper_parameters is not None:
             self._set_optimized_hp_for_child_process(optimized_hyper_parameters)
@@ -308,19 +304,19 @@ def finalize(self):
 
     @staticmethod
     def initialize_multigpu_train(rank: int, gpu_ids: List[int], multi_gpu_port: str):
-        os.environ['MASTER_ADDR'] = 'localhost'
-        os.environ['MASTER_PORT'] = multi_gpu_port
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = multi_gpu_port
         torch.cuda.set_device(gpu_ids[rank])
-        dist.init_process_group(backend='nccl', world_size=len(gpu_ids), rank=rank)
-        print(f'dist info world_size = {dist.get_world_size()}, rank = {dist.get_rank()}')
+        dist.init_process_group(backend="nccl", world_size=len(gpu_ids), rank=rank)
+        print(f"dist info world_size = {dist.get_world_size()}, rank = {dist.get_rank()}")
 
     @staticmethod
     def run_child_process(rank: int, gpu_ids: List[int], output_path: str, multi_gpu_port: str):
-        gpus_arg_idx = sys.argv.index('--gpus')
+        gpus_arg_idx = sys.argv.index("--gpus")
         for _ in range(2):
             sys.argv.pop(gpus_arg_idx)
         if "--enable-hpo" in sys.argv:
-            sys.argv.remove('--enable-hpo')
+            sys.argv.remove("--enable-hpo")
         MultiGPUManager.set_arguments_to_argv("--save-logs-to", output_path)
         MultiGPUManager.initialize_multigpu_train(rank, gpu_ids, multi_gpu_port)
@@ -337,16 +333,15 @@ def set_arguments_to_argv(key: str, value: str, after_params: bool = False):
             sys.argv.insert(sys.argv.index("params"), value)
         else:
             if after_params and "params" not in sys.argv:
-                sys.argv.append('params')
+                sys.argv.append("params")
             sys.argv.extend([key, value])
 
     def _spawn_multi_gpu_processes(self, output_path: str) -> List[mp.Process]:
-        processes= []
+        processes = []
         spawned_mp = mp.get_context("spawn")
         for rank in range(1, len(self._gpu_ids)):
             task_p = spawned_mp.Process(
-                target=MultiGPUManager.run_child_process,
-                args=(rank, self._gpu_ids, output_path, self._multi_gpu_port)
+                target=MultiGPUManager.run_child_process, args=(rank, self._gpu_ids, output_path, self._multi_gpu_port)
             )
             task_p.start()
             processes.append(task_p)
@@ -378,14 +373,10 @@ def _kill_child_process(self):
 
     def _set_optimized_hp_for_child_process(self, hyper_parameters: ConfigurableParameters):
         self.set_arguments_to_argv(
-            "--learning_parameters.learning_rate",
-            str(hyper_parameters.learning_parameters.learning_rate),
-            True
+            "--learning_parameters.learning_rate", str(hyper_parameters.learning_parameters.learning_rate), True
         )
         self.set_arguments_to_argv(
-            "--learning_parameters.batch_size",
-            str(hyper_parameters.learning_parameters.batch_size),
-            True
+            "--learning_parameters.batch_size", str(hyper_parameters.learning_parameters.batch_size), True
         )
 
     def _check_child_processes_alive(self):
diff --git a/otx/cli/utils/tests.py b/otx/cli/utils/tests.py
index 486401e9801..15ea6129b13 100644
--- a/otx/cli/utils/tests.py
+++ b/otx/cli/utils/tests.py
@@ -68,6 +68,10 @@ def otx_train_testing(template, root, otx_dir, args):
         "--save-model-to",
         f"{template_work_dir}/trained_{template.model_template_id}",
     ]
+    if "--gpus" in args:
+        command_line.extend(["--gpus", args["--gpus"]])
+    if "--multi-gpu-port" in args:
+        command_line.extend(["--multi-gpu-port", args["--multi-gpu-port"]])
     if "--load-weights" in args:
         command_line.extend(["--load-weights", f'{os.path.join(otx_dir, args["--load-weights"])}'])
     command_line.extend(args["train_params"])
diff --git a/tests/integration/cli/classification/test_classification.py b/tests/integration/cli/classification/test_classification.py
index f3b55392b7a..ab24c7a1ece 100644
--- a/tests/integration/cli/classification/test_classification.py
+++ b/tests/integration/cli/classification/test_classification.py
@@ -105,6 +105,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
@@ -261,6 +269,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args0 = args_m.copy()
+        args0["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args0)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
@@ -402,6 +418,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args_h.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py
index e29d16b8134..83d68c61847 100644
--- a/tests/integration/cli/detection/test_detection.py
+++ b/tests/integration/cli/detection/test_detection.py
@@ -86,6 +86,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
diff --git a/tests/integration/cli/detection/test_instance_segmentation.py b/tests/integration/cli/detection/test_instance_segmentation.py
index 2ccaac79072..3259bc60aa9 100644
--- a/tests/integration/cli/detection/test_instance_segmentation.py
+++ b/tests/integration/cli/detection/test_instance_segmentation.py
@@ -86,6 +86,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
diff --git a/tests/integration/cli/segmentation/test_segmentation.py b/tests/integration/cli/segmentation/test_segmentation.py
index 1e1cd4ba203..dc8fbab768b 100644
--- a/tests/integration/cli/segmentation/test_segmentation.py
+++ b/tests/integration/cli/segmentation/test_segmentation.py
@@ -83,6 +83,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
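-- 
For context, the mechanism these tests exercise reduces to the spawn-and-rendezvous pattern sketched below. This is an illustrative sketch placed after the patch signature marker, not part of the patch itself: it assumes a machine with at least two CUDA GPUs and the NCCL backend, and init_distributed, run_worker, and the hard-coded gpu_ids/port values are hypothetical stand-ins for what MultiGPUManager derives from the --gpus and --multi-gpu-port CLI arguments.

    import os

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def init_distributed(rank: int, gpu_ids: list, port: str) -> None:
        # Same rendezvous as MultiGPUManager.initialize_multigpu_train: every
        # process targets one master address/port and reports its own rank.
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = port
        torch.cuda.set_device(gpu_ids[rank])
        dist.init_process_group(backend="nccl", world_size=len(gpu_ids), rank=rank)


    def run_worker(rank: int, gpu_ids: list, port: str) -> None:
        init_distributed(rank, gpu_ids, port)
        # Stand-in for the real training entry point, which the child
        # processes in the patch re-enter via the rewritten sys.argv.
        dist.barrier()
        dist.destroy_process_group()


    if __name__ == "__main__":
        gpu_ids = [0, 1]  # placeholder for a parsed "--gpus 0,1"
        port = "25000"    # placeholder for "--multi-gpu-port"
        spawned_mp = mp.get_context("spawn")
        children = [
            spawned_mp.Process(target=run_worker, args=(rank, gpu_ids, port))
            for rank in range(1, len(gpu_ids))
        ]
        for child in children:
            child.start()
        run_worker(0, gpu_ids, port)  # the parent process acts as rank 0
        for child in children:
            child.join()

The integration tests above drive this path end to end through the CLI: each new test_otx_multi_gpu_train sets args["--gpus"] = "0,1", which otx_train_testing() forwards (along with "--multi-gpu-port", if set) to the otx train command line.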