From 2b8741c33eca7bfd8ea1555fe338b86d6a785230 Mon Sep 17 00:00:00 2001
From: eunwoosh
Date: Tue, 13 Dec 2022 16:21:41 +0900
Subject: [PATCH] add multi gpu test code

---
 otx/cli/tools/train.py                        | 43 ++++++++-----------
 otx/cli/utils/tests.py                        |  4 ++
 .../cli/classification/test_classification.py | 24 +++++++++++
 .../cli/detection/test_detection.py           |  8 ++++
 .../detection/test_instance_segmentation.py   |  8 ++++
 .../cli/segmentation/test_segmentation.py     |  8 ++++
 6 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/otx/cli/tools/train.py b/otx/cli/tools/train.py
index 67ff4c0a37c..11a87aed575 100644
--- a/otx/cli/tools/train.py
+++ b/otx/cli/tools/train.py
@@ -16,11 +16,11 @@
 
 import argparse
 import os
-import sys
 import signal
+import sys
 import threading
-from typing import List, Optional
 import time
+from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -221,10 +221,7 @@ def main():
 
     if args.gpus:
         multigpu_manager = MultiGPUManager(args.gpus, str(args.multi_gpu_port))
         if multigpu_manager.is_available(template):
-            multigpu_manager.setup_multi_gpu_train(
-                task.project_path,
-                hyper_parameters if args.enable_hpo else None
-            )
+            multigpu_manager.setup_multi_gpu_train(task.project_path, hyper_parameters if args.enable_hpo else None)
 
     output_model = ModelEntity(dataset, environment.get_model_configuration())
@@ -252,6 +249,7 @@ def main():
     if args.gpus:
         multigpu_manager.finalize()
 
+
 class MultiGPUManager:
     def __init__(self, gpu_ids: str, multi_gpu_port: str):
         self._gpu_ids = self._get_gpu_ids(gpu_ids)
@@ -262,7 +260,7 @@ def __init__(self, gpu_ids: str, multi_gpu_port: str):
     def _get_gpu_ids(self, gpus: str) -> List[int]:
         num_available_gpu = torch.cuda.device_count()
         gpu_ids = []
-        for gpu_id in gpus.split(','):
+        for gpu_id in gpus.split(","):
             if not gpu_id.isnumeric():
                 raise RuntimeError("--gpus argument should be numbers separated by ','.")
             gpu_ids.append(int(gpu_id))
@@ -284,9 +282,7 @@ def is_available(self, template) -> bool:
         return len(self._gpu_ids) > 1 and not template.task_type.is_anomaly
 
     def setup_multi_gpu_train(
-        self,
-        output_path: str,
-        optimized_hyper_parameters: Optional[ConfigurableParameters] = None
+        self, output_path: str, optimized_hyper_parameters: Optional[ConfigurableParameters] = None
     ):
         if optimized_hyper_parameters is not None:
             self._set_optimized_hp_for_child_process(optimized_hyper_parameters)
@@ -308,19 +304,19 @@ def finalize(self):
 
     @staticmethod
     def initialize_multigpu_train(rank: int, gpu_ids: List[int], multi_gpu_port: str):
-        os.environ['MASTER_ADDR'] = 'localhost'
-        os.environ['MASTER_PORT'] = multi_gpu_port
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = multi_gpu_port
         torch.cuda.set_device(gpu_ids[rank])
-        dist.init_process_group(backend='nccl', world_size=len(gpu_ids), rank=rank)
-        print(f'dist info world_size = {dist.get_world_size()}, rank = {dist.get_rank()}')
+        dist.init_process_group(backend="nccl", world_size=len(gpu_ids), rank=rank)
+        print(f"dist info world_size = {dist.get_world_size()}, rank = {dist.get_rank()}")
 
     @staticmethod
     def run_child_process(rank: int, gpu_ids: List[int], output_path: str, multi_gpu_port: str):
-        gpus_arg_idx = sys.argv.index('--gpus')
+        gpus_arg_idx = sys.argv.index("--gpus")
         for _ in range(2):
             sys.argv.pop(gpus_arg_idx)
         if "--enable-hpo" in sys.argv:
-            sys.argv.remove('--enable-hpo')
+            sys.argv.remove("--enable-hpo")
         MultiGPUManager.set_arguments_to_argv("--save-logs-to", output_path)
         MultiGPUManager.initialize_multigpu_train(rank, gpu_ids, multi_gpu_port)
@@ -337,16 +333,15 @@ def set_arguments_to_argv(key: str, value: str, after_params: bool = False):
             sys.argv.insert(sys.argv.index("params"), value)
         else:
             if after_params and "params" not in sys.argv:
-                sys.argv.append('params')
+                sys.argv.append("params")
             sys.argv.extend([key, value])
 
     def _spawn_multi_gpu_processes(self, output_path: str) -> List[mp.Process]:
-        processes= []
+        processes = []
         spawned_mp = mp.get_context("spawn")
         for rank in range(1, len(self._gpu_ids)):
             task_p = spawned_mp.Process(
-                target=MultiGPUManager.run_child_process,
-                args=(rank, self._gpu_ids, output_path, self._multi_gpu_port)
+                target=MultiGPUManager.run_child_process, args=(rank, self._gpu_ids, output_path, self._multi_gpu_port)
             )
             task_p.start()
             processes.append(task_p)
@@ -378,14 +373,10 @@ def _kill_child_process(self):
 
     def _set_optimized_hp_for_child_process(self, hyper_parameters: ConfigurableParameters):
         self.set_arguments_to_argv(
-            "--learning_parameters.learning_rate",
-            str(hyper_parameters.learning_parameters.learning_rate),
-            True
+            "--learning_parameters.learning_rate", str(hyper_parameters.learning_parameters.learning_rate), True
         )
         self.set_arguments_to_argv(
-            "--learning_parameters.batch_size",
-            str(hyper_parameters.learning_parameters.batch_size),
-            True
+            "--learning_parameters.batch_size", str(hyper_parameters.learning_parameters.batch_size), True
         )
 
     def _check_child_processes_alive(self):
diff --git a/otx/cli/utils/tests.py b/otx/cli/utils/tests.py
index 486401e9801..15ea6129b13 100644
--- a/otx/cli/utils/tests.py
+++ b/otx/cli/utils/tests.py
@@ -68,6 +68,10 @@ def otx_train_testing(template, root, otx_dir, args):
         "--save-model-to",
         f"{template_work_dir}/trained_{template.model_template_id}",
     ]
+    if "--gpus" in args:
+        command_line.extend(["--gpus", args["--gpus"]])
+    if "--multi-gpu-port" in args:
+        command_line.extend(["--multi-gpu-port", args["--multi-gpu-port"]])
     if "--load-weights" in args:
         command_line.extend(["--load-weights", f'{os.path.join(otx_dir, args["--load-weights"])}'])
     command_line.extend(args["train_params"])
diff --git a/tests/integration/cli/classification/test_classification.py b/tests/integration/cli/classification/test_classification.py
index f3b55392b7a..ab24c7a1ece 100644
--- a/tests/integration/cli/classification/test_classification.py
+++ b/tests/integration/cli/classification/test_classification.py
@@ -105,6 +105,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
@@ -261,6 +269,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args0 = args_m.copy()
+        args0["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args0)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
@@ -402,6 +418,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args_h.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py
index e29d16b8134..83d68c61847 100644
--- a/tests/integration/cli/detection/test_detection.py
+++ b/tests/integration/cli/detection/test_detection.py
@@ -86,6 +86,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
diff --git a/tests/integration/cli/detection/test_instance_segmentation.py b/tests/integration/cli/detection/test_instance_segmentation.py
index 2ccaac79072..3259bc60aa9 100644
--- a/tests/integration/cli/detection/test_instance_segmentation.py
+++ b/tests/integration/cli/detection/test_instance_segmentation.py
@@ -86,6 +86,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
diff --git a/tests/integration/cli/segmentation/test_segmentation.py b/tests/integration/cli/segmentation/test_segmentation.py
index 1e1cd4ba203..dc8fbab768b 100644
--- a/tests/integration/cli/segmentation/test_segmentation.py
+++ b/tests/integration/cli/segmentation/test_segmentation.py
@@ -83,6 +83,14 @@ def test_otx_train(self, template, tmp_dir_path):
         args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
         otx_train_testing(template, tmp_dir_path, otx_dir, args1)
 
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        args1 = args.copy()
+        args1["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path, otx_dir, args1)
+
     @e2e_pytest_component
     @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
     @pytest.mark.parametrize("template", templates, ids=templates_ids)
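-- 
For context, the mechanism these tests exercise reduces to the spawn-and-rendezvous pattern sketched below. This is an illustrative sketch placed after the patch signature marker, not part of the patch itself: it assumes a machine with at least two CUDA GPUs and the NCCL backend, and init_distributed, run_worker, and the hard-coded gpu_ids/port values are hypothetical stand-ins for what MultiGPUManager derives from the --gpus and --multi-gpu-port CLI arguments.

    import os

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def init_distributed(rank: int, gpu_ids: list, port: str) -> None:
        # Same rendezvous as MultiGPUManager.initialize_multigpu_train: every
        # process targets one master address/port and reports its own rank.
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = port
        torch.cuda.set_device(gpu_ids[rank])
        dist.init_process_group(backend="nccl", world_size=len(gpu_ids), rank=rank)


    def run_worker(rank: int, gpu_ids: list, port: str) -> None:
        init_distributed(rank, gpu_ids, port)
        # Stand-in for the real training entry point, which the child
        # processes in the patch re-enter via the rewritten sys.argv.
        dist.barrier()
        dist.destroy_process_group()


    if __name__ == "__main__":
        gpu_ids = [0, 1]  # placeholder for a parsed "--gpus 0,1"
        port = "25000"    # placeholder for "--multi-gpu-port"
        spawned_mp = mp.get_context("spawn")
        children = [
            spawned_mp.Process(target=run_worker, args=(rank, gpu_ids, port))
            for rank in range(1, len(gpu_ids))
        ]
        for child in children:
            child.start()
        run_worker(0, gpu_ids, port)  # the parent process acts as rank 0
        for child in children:
            child.join()

The integration tests above drive this path end to end through the CLI: each new test_otx_multi_gpu_train sets args["--gpus"] = "0,1", which otx_train_testing() forwards (along with "--multi-gpu-port", if set) to the otx train command line.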