From 44c72b6676194dd8942b66acba1123772389fdc1 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Thu, 14 Mar 2024 17:26:33 +0900
Subject: [PATCH 1/6] stop using hard-coded subset names in HPO

---
 src/otx/core/model/module/detection.py             |  2 +-
 src/otx/core/model/module/instance_segmentation.py |  2 +-
 src/otx/engine/hpo/hpo_api.py                      | 11 +++++++++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/otx/core/model/module/detection.py b/src/otx/core/model/module/detection.py
index 8d5ea504ad0..d49956c6b2d 100644
--- a/src/otx/core/model/module/detection.py
+++ b/src/otx/core/model/module/detection.py
@@ -55,7 +55,7 @@ def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None:
         if "confidence_threshold" in ckpt:
             self.test_meta_info["best_confidence_threshold"] = ckpt["confidence_threshold"]
             self.test_meta_info["vary_confidence_threshold"] = False
-        elif "confidence_threshold" in ckpt["hyper_parameters"]:
+        elif "hyper_parameters" in ckpt and "confidence_threshold" in ckpt["hyper_parameters"]:
             self.test_meta_info["best_confidence_threshold"] = ckpt["hyper_parameters"]["confidence_threshold"]
             self.test_meta_info["vary_confidence_threshold"] = False
         super().load_state_dict(ckpt, *args, **kwargs)
diff --git a/src/otx/core/model/module/instance_segmentation.py b/src/otx/core/model/module/instance_segmentation.py
index 60aff90d0ca..b4c55777565 100644
--- a/src/otx/core/model/module/instance_segmentation.py
+++ b/src/otx/core/model/module/instance_segmentation.py
@@ -58,7 +58,7 @@ def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None:
         if "confidence_threshold" in ckpt:
             self.test_meta_info["best_confidence_threshold"] = ckpt["confidence_threshold"]
             self.test_meta_info["vary_confidence_threshold"] = False
-        elif "confidence_threshold" in ckpt["hyper_parameters"]:
+        elif "hyper_parameters" in ckpt and "confidence_threshold" in ckpt["hyper_parameters"]:
             self.test_meta_info["best_confidence_threshold"] = ckpt["hyper_parameters"]["confidence_threshold"]
             self.test_meta_info["vary_confidence_threshold"] = False
         super().load_state_dict(ckpt, *args, **kwargs)
diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py
index bcafb6039ae..43b96d74455 100644
--- a/src/otx/engine/hpo/hpo_api.py
+++ b/src/otx/engine/hpo/hpo_api.py
@@ -133,8 +133,15 @@ def hpo_config(self) -> dict[str, Any]:
 
     @hpo_config.setter
     def hpo_config(self, hpo_config: HpoConfig | None) -> None:
-        train_dataset_size = len(self._engine.datamodule.subsets["train"])
-        val_dataset_size = len(self._engine.datamodule.subsets["val"])
+        train_dataset_size = len(
+            self._engine.datamodule.subsets[self._engine.datamodule.config.train_subset.subset_name],
+        )
+        if self._engine.datamodule.config.val_subset.subset_name in self._engine.datamodule.subsets:
+            val_dataset_size = len(
+                self._engine.datamodule.subsets[self._engine.datamodule.config.val_subset.subset_name],
+            )
+        else:
+            val_dataset_size = 1
 
         self._hpo_config: dict[str, Any] = {  # default setting
             "save_path": str(self._hpo_workdir),

From 39bb69f480047100acf076d8c22efa2ca0b4f721 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Mon, 18 Mar 2024 11:32:08 +0900
Subject: [PATCH 2/6] apply comment: use dataloader length for train/val sizes

---
 src/otx/engine/hpo/hpo_api.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py
index 43b96d74455..4624bf565c6 100644
--- a/src/otx/engine/hpo/hpo_api.py
+++ b/src/otx/engine/hpo/hpo_api.py
@@ -133,13 +133,9 @@ def hpo_config(self) -> dict[str, Any]:
 
     @hpo_config.setter
     def hpo_config(self, hpo_config: HpoConfig | None) -> None:
-        train_dataset_size = len(
-            self._engine.datamodule.subsets[self._engine.datamodule.config.train_subset.subset_name],
-        )
+        train_dataset_size = len(self._engine.datamodule.train_dataloader())
         if self._engine.datamodule.config.val_subset.subset_name in self._engine.datamodule.subsets:
-            val_dataset_size = len(
-                self._engine.datamodule.subsets[self._engine.datamodule.config.val_subset.subset_name],
-            )
+            val_dataset_size = len(self._engine.datamodule.val_dataloader())
         else:
             val_dataset_size = 1
 

From 7e5b14dc1c60c48ca32336c4859befc531c13b82 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 22 Mar 2024 17:10:21 +0900
Subject: [PATCH 3/6] remove non_pure_train_ratio from HPO

---
 src/otx/engine/hpo/hpo_api.py | 5 -----
 src/otx/hpo/hpo_base.py       | 9 ---------
 2 files changed, 14 deletions(-)

diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py
index 4624bf565c6..e45b88e06af 100644
--- a/src/otx/engine/hpo/hpo_api.py
+++ b/src/otx/engine/hpo/hpo_api.py
@@ -134,16 +134,11 @@ def hpo_config(self) -> dict[str, Any]:
     @hpo_config.setter
     def hpo_config(self, hpo_config: HpoConfig | None) -> None:
         train_dataset_size = len(self._engine.datamodule.train_dataloader())
-        if self._engine.datamodule.config.val_subset.subset_name in self._engine.datamodule.subsets:
-            val_dataset_size = len(self._engine.datamodule.val_dataloader())
-        else:
-            val_dataset_size = 1
 
         self._hpo_config: dict[str, Any] = {  # default setting
             "save_path": str(self._hpo_workdir),
             "num_full_iterations": self._max_epoch,
             "full_dataset_size": train_dataset_size,
-            "non_pure_train_ratio": val_dataset_size / (train_dataset_size + val_dataset_size),
         }
 
         if hpo_config is not None:
diff --git a/src/otx/hpo/hpo_base.py b/src/otx/hpo/hpo_base.py
index f0e03452a57..dfb0e412b62 100644
--- a/src/otx/hpo/hpo_base.py
+++ b/src/otx/hpo/hpo_base.py
@@ -36,7 +36,6 @@ class HpoBase(ABC):
         num_trials (int | None, optional): How many training to conduct for HPO.
         num_workers (int, optional): How many trains are executed in parallel.
         num_full_iterations (int, optional): epoch for traninig after HPO.
-        non_pure_train_ratio (float, optional): ratio of validation time to (train time + validation time)
         full_dataset_size (int, optional): train dataset size
         expected_time_ratio (int | float | None, optional): Time to use for HPO.
                                                             If HPO is configured automatically,
@@ -64,7 +63,6 @@ def __init__(
         num_trials: int | None = None,
         num_workers: int = 1,
         num_full_iterations: int | float = 1,
-        non_pure_train_ratio: float = 0.2,
         full_dataset_size: int = 0,
         expected_time_ratio: int | float | None = None,
         maximum_resource: int | float | None = None,
@@ -78,12 +76,6 @@ def __init__(
         check_mode_input(mode)
         check_positive(full_dataset_size, "full_dataset_size")
         check_positive(num_full_iterations, "num_full_iterations")
-        if not 0 < non_pure_train_ratio <= 1:
-            error_msg = (
-                "non_pure_train_ratio should be greater than 0 and lesser than or equal to 1."
-                f"Your value is {subset_ratio}"
-            )
-            raise ValueError(error_msg)
         if maximum_resource is not None:
             check_positive(maximum_resource, "maximum_resource")
         if num_trials is not None:
@@ -103,7 +95,6 @@ def __init__(
         self.num_trials = num_trials
         self.num_workers = num_workers
         self.num_full_iterations = num_full_iterations
-        self.non_pure_train_ratio = non_pure_train_ratio
         self.full_dataset_size = full_dataset_size
         self.expected_time_ratio = expected_time_ratio
         self.maximum_resource: int | float | None = maximum_resource

From f3ec37e8047f69efd1572843233e46c285397374 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 22 Mar 2024 17:10:48 +0900
Subject: [PATCH 4/6] raise error once HPO trial exits abnormally

---
 src/otx/hpo/hpo_runner.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/otx/hpo/hpo_runner.py b/src/otx/hpo/hpo_runner.py
index 2a936ff1a49..11494ed6de9 100644
--- a/src/otx/hpo/hpo_runner.py
+++ b/src/otx/hpo/hpo_runner.py
@@ -67,7 +67,6 @@ def __init__(
         self._mp = multiprocessing.get_context("spawn")
         self._report_queue = self._mp.Queue()
         self._uid_index = 0
-        self._trial_fault_count = 0
         self._resource_manager = get_resource_manager(
             resource_type,
             num_parallel_trial,
@@ -83,7 +82,7 @@ def run(self) -> None:
         """Run a HPO loop."""
         logger.info("HPO loop starts.")
         try:
-            while not self._hpo_algo.is_done() and self._trial_fault_count < 3:
+            while not self._hpo_algo.is_done():
                 if self._resource_manager.have_available_resource():
                     trial = self._hpo_algo.get_next_sample()
                     if trial is not None:
@@ -98,9 +97,6 @@ def run(self) -> None:
             raise e  # noqa: TRY201
         logger.info("HPO loop is done.")
 
-        if self._trial_fault_count >= 3:
-            logger.warning("HPO trials exited abnormally more than three times. HPO is suspended.")
-
         self._get_reports()
         self._join_all_processes()
 
@@ -143,7 +139,8 @@ def _remove_finished_process(self) -> None:
         for uid, trial in self._running_trials.items():
             if not trial.process.is_alive():
                 if trial.process.exitcode != 0:
-                    self._trial_fault_count += 1
+                    self._terminate_all_running_processes()
+                    raise RuntimeError("A HPO trial exited abnormally.")
                 trial.queue.close()
                 trial.process.join()
                 trial_to_remove.append(uid)

From f4108d0a70debbb7c9a313c018923845370d5ff5 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 22 Mar 2024 17:18:44 +0900
Subject: [PATCH 5/6] update unit test

---
 tests/unit/hpo/test_hyperband.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unit/hpo/test_hyperband.py b/tests/unit/hpo/test_hyperband.py
index 00dad36d58c..971f319005b 100644
--- a/tests/unit/hpo/test_hyperband.py
+++ b/tests/unit/hpo/test_hyperband.py
@@ -74,7 +74,6 @@ def good_hyperband_args():
             "mode": "max",
             "num_workers": 1,
             "num_full_iterations": 64,
-            "non_pure_train_ratio": 0.2,
             "full_dataset_size": 100,
             "maximum_resource": 64,
             "minimum_resource": 1,

From 86080f002dc6f62c2625e0822b2a7bea90f8c1a0 Mon Sep 17 00:00:00 2001
From: "Shin, Eunwoo" <eunwoo.shin@intel.com>
Date: Fri, 22 Mar 2024 17:20:11 +0900
Subject: [PATCH 6/6] align with pre-commit

---
 src/otx/hpo/hpo_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/otx/hpo/hpo_runner.py b/src/otx/hpo/hpo_runner.py
index 11494ed6de9..f43064aeeed 100644
--- a/src/otx/hpo/hpo_runner.py
+++ b/src/otx/hpo/hpo_runner.py
@@ -140,7 +140,8 @@ def _remove_finished_process(self) -> None:
             if not trial.process.is_alive():
                 if trial.process.exitcode != 0:
                     self._terminate_all_running_processes()
-                    raise RuntimeError("A HPO trial exited abnormally.")
+                    msg = "One of HPO trials exit abnormally."
+                    raise RuntimeError(msg)
                 trial.queue.close()
                 trial.process.join()
                 trial_to_remove.append(uid)