Commit 2b8741c: add multi gpu test code

eunwoosh committed Dec 13, 2022
1 parent a7d48b0 commit 2b8741c
Showing 6 changed files with 69 additions and 26 deletions.
43 changes: 17 additions & 26 deletions otx/cli/tools/train.py
@@ -16,11 +16,11 @@

import argparse
import os
import sys
import signal
import sys
import threading
from typing import List, Optional
import time
from typing import List, Optional

import torch
import torch.distributed as dist
@@ -221,10 +221,7 @@ def main():
if args.gpus:
multigpu_manager = MultiGPUManager(args.gpus, str(args.multi_gpu_port))
if multigpu_manager.is_available(template):
multigpu_manager.setup_multi_gpu_train(
task.project_path,
hyper_parameters if args.enable_hpo else None
)
multigpu_manager.setup_multi_gpu_train(task.project_path, hyper_parameters if args.enable_hpo else None)

output_model = ModelEntity(dataset, environment.get_model_configuration())

@@ -252,6 +249,7 @@ def main():
if args.gpus:
multigpu_manager.finalize()
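Taken together, the train.py hunks give the parent process this control flow. The sketch below is a non-runnable paraphrase condensed from the hunks above: argument parsing, dataset and task construction are elided, and the task.train call is paraphrased rather than copied from the file.

# Paraphrased control flow: the parent process doubles as rank 0.
if args.gpus:
    multigpu_manager = MultiGPUManager(args.gpus, str(args.multi_gpu_port))
    if multigpu_manager.is_available(template):
        # Spawn one child per extra GPU; forward tuned hyper-parameters
        # through argv when HPO ran first.
        multigpu_manager.setup_multi_gpu_train(
            task.project_path,
            hyper_parameters if args.enable_hpo else None,
        )

task.train(...)  # rank 0 trains in the parent (call paraphrased)

if args.gpus:
    multigpu_manager.finalize()  # wait for / clean up child processes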


class MultiGPUManager:
def __init__(self, gpu_ids: str, multi_gpu_port: str):
self._gpu_ids = self._get_gpu_ids(gpu_ids)
@@ -262,7 +260,7 @@ def __init__(self, gpu_ids: str, multi_gpu_port: str):
def _get_gpu_ids(self, gpus: str) -> List[int]:
num_available_gpu = torch.cuda.device_count()
gpu_ids = []
for gpu_id in gpus.split(','):
for gpu_id in gpus.split(","):
if not gpu_id.isnumeric():
raise RuntimeError("--gpus argument should be numbers separated by ','.")
gpu_ids.append(int(gpu_id))
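For illustration, here is the parsing rule above in isolation — a standalone re-implementation for demonstration, not the module itself (the rest of _get_gpu_ids sits outside this hunk):

from typing import List

def parse_gpu_ids(gpus: str) -> List[int]:
    # Same rule as _get_gpu_ids: each comma-separated token must be numeric.
    gpu_ids = []
    for gpu_id in gpus.split(","):
        if not gpu_id.isnumeric():
            raise RuntimeError("--gpus argument should be numbers separated by ','.")
        gpu_ids.append(int(gpu_id))
    return gpu_ids

print(parse_gpu_ids("0,1"))  # [0, 1]
# parse_gpu_ids("0, 1") raises RuntimeError: " 1" contains a space,
# so str.isnumeric() returns False.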
@@ -284,9 +282,7 @@ def is_available(self, template) -> bool:
return len(self._gpu_ids) > 1 and not template.task_type.is_anomaly

def setup_multi_gpu_train(
self,
output_path: str,
optimized_hyper_parameters: Optional[ConfigurableParameters] = None
self, output_path: str, optimized_hyper_parameters: Optional[ConfigurableParameters] = None
):
if optimized_hyper_parameters is not None:
self._set_optimized_hp_for_child_process(optimized_hyper_parameters)
@@ -308,19 +304,19 @@ def finalize(self):

@staticmethod
def initialize_multigpu_train(rank: int, gpu_ids: List[int], multi_gpu_port: str):
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = multi_gpu_port
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = multi_gpu_port
torch.cuda.set_device(gpu_ids[rank])
dist.init_process_group(backend='nccl', world_size=len(gpu_ids), rank=rank)
print(f'dist info world_size = {dist.get_world_size()}, rank = {dist.get_rank()}')
dist.init_process_group(backend="nccl", world_size=len(gpu_ids), rank=rank)
print(f"dist info world_size = {dist.get_world_size()}, rank = {dist.get_rank()}")

@staticmethod
def run_child_process(rank: int, gpu_ids: List[int], output_path: str, multi_gpu_port: str):
gpus_arg_idx = sys.argv.index('--gpus')
gpus_arg_idx = sys.argv.index("--gpus")
for _ in range(2):
sys.argv.pop(gpus_arg_idx)
if "--enable-hpo" in sys.argv:
sys.argv.remove('--enable-hpo')
sys.argv.remove("--enable-hpo")
MultiGPUManager.set_arguments_to_argv("--save-logs-to", output_path)

MultiGPUManager.initialize_multigpu_train(rank, gpu_ids, multi_gpu_port)
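The argv surgery at the top of run_child_process keeps children from spawning grandchildren or re-running HPO. A small runnable illustration with a made-up command line (only the "--gpus"/"--enable-hpo" handling mirrors the code above):

import sys

# Hypothetical parent argv for demonstration purposes.
sys.argv = ["otx", "train", "template.yaml", "--gpus", "0,1", "--enable-hpo"]

gpus_arg_idx = sys.argv.index("--gpus")
for _ in range(2):  # pop "--gpus" and its value so the child cannot re-spawn
    sys.argv.pop(gpus_arg_idx)
if "--enable-hpo" in sys.argv:
    sys.argv.remove("--enable-hpo")  # HPO already ran in the parent

print(sys.argv)  # ['otx', 'train', 'template.yaml']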
@@ -337,16 +333,15 @@ def set_arguments_to_argv(key: str, value: str, after_params: bool = False):
sys.argv.insert(sys.argv.index("params"), value)
else:
if after_params and "params" not in sys.argv:
sys.argv.append('params')
sys.argv.append("params")
sys.argv.extend([key, value])
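The branch shown appends new key/value pairs, inserting a "params" marker first when the caller asks for a slot after the positional-parameters section. For example (the argv contents are made up for illustration):

import sys

sys.argv = ["otx", "train", "template.yaml"]  # hypothetical child argv

# Effect of set_arguments_to_argv("--learning_parameters.learning_rate",
# "0.01", after_params=True) when the key is not yet present:
if "params" not in sys.argv:
    sys.argv.append("params")
sys.argv.extend(["--learning_parameters.learning_rate", "0.01"])

print(sys.argv)
# ['otx', 'train', 'template.yaml', 'params',
#  '--learning_parameters.learning_rate', '0.01']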

def _spawn_multi_gpu_processes(self, output_path: str) -> List[mp.Process]:
processes= []
processes = []
spawned_mp = mp.get_context("spawn")
for rank in range(1, len(self._gpu_ids)):
task_p = spawned_mp.Process(
target=MultiGPUManager.run_child_process,
args=(rank, self._gpu_ids, output_path, self._multi_gpu_port)
target=MultiGPUManager.run_child_process, args=(rank, self._gpu_ids, output_path, self._multi_gpu_port)
)
task_p.start()
processes.append(task_p)
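Children are created through a "spawn" context rather than the default fork, the safe choice once the parent may have initialized CUDA. A minimal standalone sketch of the same pattern — the worker body is a placeholder, and mp is assumed here to be a multiprocessing-compatible module, since the actual import sits outside this hunk:

import multiprocessing as mp

def worker(rank: int, gpu_ids: list) -> None:
    print(f"child rank {rank} -> GPU {gpu_ids[rank]}")  # placeholder body

if __name__ == "__main__":
    gpu_ids = [0, 1]
    spawned_mp = mp.get_context("spawn")  # fresh interpreter per child
    processes = []
    for rank in range(1, len(gpu_ids)):  # rank 0 stays in the parent
        p = spawned_mp.Process(target=worker, args=(rank, gpu_ids))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()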
@@ -378,14 +373,10 @@ def _kill_child_process(self):

def _set_optimized_hp_for_child_process(self, hyper_parameters: ConfigurableParameters):
self.set_arguments_to_argv(
"--learning_parameters.learning_rate",
str(hyper_parameters.learning_parameters.learning_rate),
True
"--learning_parameters.learning_rate", str(hyper_parameters.learning_parameters.learning_rate), True
)
self.set_arguments_to_argv(
"--learning_parameters.batch_size",
str(hyper_parameters.learning_parameters.batch_size),
True
"--learning_parameters.batch_size", str(hyper_parameters.learning_parameters.batch_size), True
)

def _check_child_processes_alive(self):
4 changes: 4 additions & 0 deletions otx/cli/utils/tests.py
@@ -68,6 +68,10 @@ def otx_train_testing(template, root, otx_dir, args):
"--save-model-to",
f"{template_work_dir}/trained_{template.model_template_id}",
]
if "--gpus" in args:
command_line.extend(["--gpus", args["--gpus"]])
if "--multi-gpu-port" in args:
command_line.extend(["--multi-gpu-port", args["--multi-gpu-port"]])
if "--load-weights" in args:
command_line.extend(["--load-weights", f'{os.path.join(otx_dir, args["--load-weights"])}'])
command_line.extend(args["train_params"])
24 changes: 24 additions & 0 deletions tests/integration/cli/classification/test_classification.py
@@ -105,6 +105,14 @@ def test_otx_train(self, template, tmp_dir_path):
args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
def test_otx_multi_gpu_train(self, template, tmp_dir_path):
args1 = args.copy()
args1["--gpus"] = "0,1"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
@@ -261,6 +269,14 @@ def test_otx_train(self, template, tmp_dir_path):
args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
def test_otx_multi_gpu_train(self, template, tmp_dir_path):
args0 = args0_m.copy()
args0["--gpus"] = "0,1"
otx_train_testing(template, tmp_dir_path, otx_dir, args0)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
@@ -402,6 +418,14 @@ def test_otx_train(self, template, tmp_dir_path):
args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
def test_otx_multi_gpu_train(self, template, tmp_dir_path):
args1 = args_h.copy()
args1["--gpus"] = "0,1"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
8 changes: 8 additions & 0 deletions tests/integration/cli/detection/test_detection.py
@@ -86,6 +86,14 @@ def test_otx_train(self, template, tmp_dir_path):
args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
def test_otx_multi_gpu_train(self, template, tmp_dir_path):
args1 = args.copy()
args1["--gpus"] = "0,1"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
8 changes: 8 additions & 0 deletions tests/integration/cli/detection/test_instance_segmentation.py
@@ -86,6 +86,14 @@ def test_otx_train(self, template, tmp_dir_path):
args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
def test_otx_multi_gpu_train(self, template, tmp_dir_path):
args1 = args.copy()
args1["--gpus"] = "0,1"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
8 changes: 8 additions & 0 deletions tests/integration/cli/segmentation/test_segmentation.py
@@ -83,6 +83,14 @@ def test_otx_train(self, template, tmp_dir_path):
args1["--load-weights"] = f"{template_work_dir}/trained_{template.model_template_id}/weights.pth"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)
def test_otx_multi_gpu_train(self, template, tmp_dir_path):
args1 = args.copy()
args1["--gpus"] = "0,1"
otx_train_testing(template, tmp_dir_path, otx_dir, args1)

@e2e_pytest_component
@pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
@pytest.mark.parametrize("template", templates, ids=templates_ids)