From 44c72b6676194dd8942b66acba1123772389fdc1 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" Date: Thu, 14 Mar 2024 17:26:33 +0900 Subject: [PATCH 1/6] change code using fixed subset name in hpo --- src/otx/core/model/module/detection.py | 2 +- src/otx/core/model/module/instance_segmentation.py | 2 +- src/otx/engine/hpo/hpo_api.py | 11 +++++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/otx/core/model/module/detection.py b/src/otx/core/model/module/detection.py index 8d5ea504ad0..d49956c6b2d 100644 --- a/src/otx/core/model/module/detection.py +++ b/src/otx/core/model/module/detection.py @@ -55,7 +55,7 @@ def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None: if "confidence_threshold" in ckpt: self.test_meta_info["best_confidence_threshold"] = ckpt["confidence_threshold"] self.test_meta_info["vary_confidence_threshold"] = False - elif "confidence_threshold" in ckpt["hyper_parameters"]: + elif "hyper_parameters" in ckpt and "confidence_threshold" in ckpt["hyper_parameters"]: self.test_meta_info["best_confidence_threshold"] = ckpt["hyper_parameters"]["confidence_threshold"] self.test_meta_info["vary_confidence_threshold"] = False super().load_state_dict(ckpt, *args, **kwargs) diff --git a/src/otx/core/model/module/instance_segmentation.py b/src/otx/core/model/module/instance_segmentation.py index 60aff90d0ca..b4c55777565 100644 --- a/src/otx/core/model/module/instance_segmentation.py +++ b/src/otx/core/model/module/instance_segmentation.py @@ -58,7 +58,7 @@ def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None: if "confidence_threshold" in ckpt: self.test_meta_info["best_confidence_threshold"] = ckpt["confidence_threshold"] self.test_meta_info["vary_confidence_threshold"] = False - elif "confidence_threshold" in ckpt["hyper_parameters"]: + elif "hyper_parameters" in ckpt and "confidence_threshold" in ckpt["hyper_parameters"]: self.test_meta_info["best_confidence_threshold"] = ckpt["hyper_parameters"]["confidence_threshold"] self.test_meta_info["vary_confidence_threshold"] = False super().load_state_dict(ckpt, *args, **kwargs) diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index bcafb6039ae..43b96d74455 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -133,8 +133,15 @@ def hpo_config(self) -> dict[str, Any]: @hpo_config.setter def hpo_config(self, hpo_config: HpoConfig | None) -> None: - train_dataset_size = len(self._engine.datamodule.subsets["train"]) - val_dataset_size = len(self._engine.datamodule.subsets["val"]) + train_dataset_size = len( + self._engine.datamodule.subsets[self._engine.datamodule.config.train_subset.subset_name], + ) + if self._engine.datamodule.config.val_subset.subset_name in self._engine.datamodule.subsets: + val_dataset_size = len( + self._engine.datamodule.subsets[self._engine.datamodule.config.val_subset.subset_name], + ) + else: + val_dataset_size = 1 self._hpo_config: dict[str, Any] = { # default setting "save_path": str(self._hpo_workdir), From 39bb69f480047100acf076d8c22efa2ca0b4f721 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" Date: Mon, 18 Mar 2024 11:32:08 +0900 Subject: [PATCH 2/6] apply comment --- src/otx/engine/hpo/hpo_api.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index 43b96d74455..4624bf565c6 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -133,13 +133,9 @@ def hpo_config(self) -> dict[str, Any]: @hpo_config.setter def hpo_config(self, hpo_config: HpoConfig | None) -> None: - train_dataset_size = len( - self._engine.datamodule.subsets[self._engine.datamodule.config.train_subset.subset_name], - ) + train_dataset_size = len(self._engine.datamodule.train_dataloader()) if self._engine.datamodule.config.val_subset.subset_name in self._engine.datamodule.subsets: - val_dataset_size = len( - self._engine.datamodule.subsets[self._engine.datamodule.config.val_subset.subset_name], - ) + val_dataset_size = len(self._engine.datamodule.val_dataloader()) else: val_dataset_size = 1 From 7e5b14dc1c60c48ca32336c4859befc531c13b82 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" Date: Fri, 22 Mar 2024 17:10:21 +0900 Subject: [PATCH 3/6] remove non_pure_train_ratio from HPO --- src/otx/engine/hpo/hpo_api.py | 5 ----- src/otx/hpo/hpo_base.py | 9 --------- 2 files changed, 14 deletions(-) diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index 4624bf565c6..e45b88e06af 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -134,16 +134,11 @@ def hpo_config(self) -> dict[str, Any]: @hpo_config.setter def hpo_config(self, hpo_config: HpoConfig | None) -> None: train_dataset_size = len(self._engine.datamodule.train_dataloader()) - if self._engine.datamodule.config.val_subset.subset_name in self._engine.datamodule.subsets: - val_dataset_size = len(self._engine.datamodule.val_dataloader()) - else: - val_dataset_size = 1 self._hpo_config: dict[str, Any] = { # default setting "save_path": str(self._hpo_workdir), "num_full_iterations": self._max_epoch, "full_dataset_size": train_dataset_size, - "non_pure_train_ratio": val_dataset_size / (train_dataset_size + val_dataset_size), } if hpo_config is not None: diff --git a/src/otx/hpo/hpo_base.py b/src/otx/hpo/hpo_base.py index f0e03452a57..dfb0e412b62 100644 --- a/src/otx/hpo/hpo_base.py +++ b/src/otx/hpo/hpo_base.py @@ -36,7 +36,6 @@ class HpoBase(ABC): num_trials (int | None, optional): How many training to conduct for HPO. num_workers (int, optional): How many trains are executed in parallel. num_full_iterations (int, optional): epoch for traninig after HPO. - non_pure_train_ratio (float, optional): ratio of validation time to (train time + validation time) full_dataset_size (int, optional): train dataset size expected_time_ratio (int | float | None, optional): Time to use for HPO. If HPO is configured automatically, @@ -64,7 +63,6 @@ def __init__( num_trials: int | None = None, num_workers: int = 1, num_full_iterations: int | float = 1, - non_pure_train_ratio: float = 0.2, full_dataset_size: int = 0, expected_time_ratio: int | float | None = None, maximum_resource: int | float | None = None, @@ -78,12 +76,6 @@ def __init__( check_mode_input(mode) check_positive(full_dataset_size, "full_dataset_size") check_positive(num_full_iterations, "num_full_iterations") - if not 0 < non_pure_train_ratio <= 1: - error_msg = ( - "non_pure_train_ratio should be greater than 0 and lesser than or equal to 1." - f"Your value is {subset_ratio}" - ) - raise ValueError(error_msg) if maximum_resource is not None: check_positive(maximum_resource, "maximum_resource") if num_trials is not None: @@ -103,7 +95,6 @@ def __init__( self.num_trials = num_trials self.num_workers = num_workers self.num_full_iterations = num_full_iterations - self.non_pure_train_ratio = non_pure_train_ratio self.full_dataset_size = full_dataset_size self.expected_time_ratio = expected_time_ratio self.maximum_resource: int | float | None = maximum_resource From f3ec37e8047f69efd1572843233e46c285397374 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" Date: Fri, 22 Mar 2024 17:10:48 +0900 Subject: [PATCH 4/6] raise error once HPO trial exits abnormally --- src/otx/hpo/hpo_runner.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/otx/hpo/hpo_runner.py b/src/otx/hpo/hpo_runner.py index 2a936ff1a49..11494ed6de9 100644 --- a/src/otx/hpo/hpo_runner.py +++ b/src/otx/hpo/hpo_runner.py @@ -67,7 +67,6 @@ def __init__( self._mp = multiprocessing.get_context("spawn") self._report_queue = self._mp.Queue() self._uid_index = 0 - self._trial_fault_count = 0 self._resource_manager = get_resource_manager( resource_type, num_parallel_trial, @@ -83,7 +82,7 @@ def run(self) -> None: """Run a HPO loop.""" logger.info("HPO loop starts.") try: - while not self._hpo_algo.is_done() and self._trial_fault_count < 3: + while not self._hpo_algo.is_done(): if self._resource_manager.have_available_resource(): trial = self._hpo_algo.get_next_sample() if trial is not None: @@ -98,9 +97,6 @@ def run(self) -> None: raise e # noqa: TRY201 logger.info("HPO loop is done.") - if self._trial_fault_count >= 3: - logger.warning("HPO trials exited abnormally more than three times. HPO is suspended.") - self._get_reports() self._join_all_processes() @@ -143,7 +139,8 @@ def _remove_finished_process(self) -> None: for uid, trial in self._running_trials.items(): if not trial.process.is_alive(): if trial.process.exitcode != 0: - self._trial_fault_count += 1 + self._terminate_all_running_processes() + raise RuntimeError("A HPO trial exited abnormally.") trial.queue.close() trial.process.join() trial_to_remove.append(uid) From f4108d0a70debbb7c9a313c018923845370d5ff5 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" Date: Fri, 22 Mar 2024 17:18:44 +0900 Subject: [PATCH 5/6] update unit test --- tests/unit/hpo/test_hyperband.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/hpo/test_hyperband.py b/tests/unit/hpo/test_hyperband.py index 00dad36d58c..971f319005b 100644 --- a/tests/unit/hpo/test_hyperband.py +++ b/tests/unit/hpo/test_hyperband.py @@ -74,7 +74,6 @@ def good_hyperband_args(): "mode": "max", "num_workers": 1, "num_full_iterations": 64, - "non_pure_train_ratio": 0.2, "full_dataset_size": 100, "maximum_resource": 64, "minimum_resource": 1, From 86080f002dc6f62c2625e0822b2a7bea90f8c1a0 Mon Sep 17 00:00:00 2001 From: "Shin, Eunwoo" Date: Fri, 22 Mar 2024 17:20:11 +0900 Subject: [PATCH 6/6] align with pre-commit --- src/otx/hpo/hpo_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/otx/hpo/hpo_runner.py b/src/otx/hpo/hpo_runner.py index 11494ed6de9..f43064aeeed 100644 --- a/src/otx/hpo/hpo_runner.py +++ b/src/otx/hpo/hpo_runner.py @@ -140,7 +140,8 @@ def _remove_finished_process(self) -> None: if not trial.process.is_alive(): if trial.process.exitcode != 0: self._terminate_all_running_processes() - raise RuntimeError("A HPO trial exited abnormally.") + msg = "One of HPO trials exit abnormally." + raise RuntimeError(msg) trial.queue.close() trial.process.join() trial_to_remove.append(uid)