This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

[nas] fix issue introduced by the trial recovery feature #5109

Merged · 92 commits · merged on Oct 12, 2022
Commits (92 total; changes shown from 86 commits)
e766a22
update
QuanluZhang May 5, 2022
0a39a09
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 5, 2022
e074967
update
QuanluZhang May 5, 2022
1f4eeea
update
QuanluZhang May 5, 2022
b9c788b
update
QuanluZhang May 6, 2022
1e97e04
update
QuanluZhang May 6, 2022
c4be6d5
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 7, 2022
9e39e92
runnable
QuanluZhang May 7, 2022
6ebd774
update
QuanluZhang May 9, 2022
81ff246
update
QuanluZhang May 9, 2022
5d3e681
fix pylint
QuanluZhang May 10, 2022
9c580d5
fix pyright
QuanluZhang May 10, 2022
1c2f6de
update
QuanluZhang May 10, 2022
5086e0a
fix pyright
QuanluZhang May 10, 2022
db9f4e4
update
QuanluZhang May 10, 2022
9097175
minor
QuanluZhang May 10, 2022
879aa56
minor
QuanluZhang May 10, 2022
1d723ad
update
QuanluZhang May 10, 2022
3d9e10c
resolve some comments
QuanluZhang May 15, 2022
a8c15ea
resolve comments
QuanluZhang May 15, 2022
05d71fe
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 15, 2022
5f4b32c
minor
QuanluZhang May 15, 2022
6743fa7
pyright
QuanluZhang May 16, 2022
aa85f16
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 16, 2022
d1ea7f5
fix ut
QuanluZhang May 16, 2022
a3b55c2
minor
QuanluZhang May 16, 2022
f895116
fix cgo pipe
QuanluZhang May 16, 2022
33fd0b0
refactor
QuanluZhang May 16, 2022
7609983
fix pylint
QuanluZhang May 16, 2022
c51a520
minor
QuanluZhang May 16, 2022
7edef1a
fix pyright
QuanluZhang May 16, 2022
644cc72
resolve comments
QuanluZhang May 20, 2022
d610d43
resolve all the comments
QuanluZhang May 23, 2022
42824fd
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 23, 2022
6e9ca35
add comment
QuanluZhang May 23, 2022
3e4a84a
fix bug
QuanluZhang May 23, 2022
b6876eb
remove print
QuanluZhang May 23, 2022
1055399
remove trailing whitespace
QuanluZhang May 23, 2022
874d19b
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 6, 2022
aaab676
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 6, 2022
e0be690
fix not exist issue
QuanluZhang Jun 7, 2022
811e44e
add unittest
QuanluZhang Jun 8, 2022
bc849a1
add one more test
QuanluZhang Jun 8, 2022
2fbc261
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 8, 2022
77ae20b
resolve comments
QuanluZhang Jun 8, 2022
b664a0a
update
QuanluZhang Jun 9, 2022
ecf87c3
fix pipeline
QuanluZhang Jun 9, 2022
49fa868
add timeout for one test
QuanluZhang Jun 10, 2022
8d44079
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 15, 2022
fc99b43
release note
QuanluZhang Jun 15, 2022
a94cdf9
resolve comments
QuanluZhang Jun 21, 2022
11e458d
Merge branch 'v2.8' of github.com:microsoft/nni into dev-new-nas-expe…
QuanluZhang Jun 21, 2022
57a03f7
add doc links
QuanluZhang Jun 21, 2022
09ba2c5
update
QuanluZhang Jun 22, 2022
ad4e90e
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jul 3, 2022
72f3ff8
nas experiment view
QuanluZhang Jul 5, 2022
e0342eb
minor
QuanluZhang Jul 5, 2022
99374f3
support nas experiment resume
QuanluZhang Jul 7, 2022
cbaad0f
fix pylint
QuanluZhang Jul 7, 2022
0abe5a3
finish main functionality
QuanluZhang Jul 11, 2022
811ade7
fix pylint
QuanluZhang Jul 11, 2022
7b2d042
fix pyright
QuanluZhang Jul 11, 2022
6e2019c
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Aug 6, 2022
36103d8
update
QuanluZhang Aug 6, 2022
12334d6
resolve comments
QuanluZhang Aug 6, 2022
49b1dc4
minor
QuanluZhang Aug 6, 2022
adf87a8
add ut
QuanluZhang Aug 9, 2022
6ecfd3a
minor
QuanluZhang Aug 9, 2022
8f20b2f
fix pylint
QuanluZhang Aug 9, 2022
ece0771
fix ut
QuanluZhang Aug 9, 2022
bb71804
fix ut
QuanluZhang Aug 9, 2022
3185609
minor
QuanluZhang Aug 9, 2022
0b4cc96
resolve comments
QuanluZhang Aug 9, 2022
e520c0d
resolve comments
QuanluZhang Aug 9, 2022
a6cb74d
fix pylint
QuanluZhang Aug 9, 2022
e6f0fea
resolve comments
QuanluZhang Aug 10, 2022
f4faa73
move test
QuanluZhang Aug 10, 2022
7572627
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Aug 26, 2022
eab49a6
fix bug and add doc
QuanluZhang Aug 26, 2022
bccb126
resolve comment
QuanluZhang Aug 29, 2022
2b8d58e
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Sep 5, 2022
4d17783
fix issue
QuanluZhang Sep 5, 2022
097a781
fix
QuanluZhang Sep 5, 2022
ca4d86d
update
QuanluZhang Sep 5, 2022
eadfeb9
minor
QuanluZhang Sep 5, 2022
2439a57
fix pylint
QuanluZhang Sep 6, 2022
c17256d
fix bug
QuanluZhang Sep 6, 2022
7d905ec
resolve comments
QuanluZhang Sep 6, 2022
e679b4a
quick fix
QuanluZhang Sep 6, 2022
b115deb
fix incomplete test data
QuanluZhang Sep 6, 2022
abf7116
Merge branch 'master' of https://github.com/microsoft/nni into dev-ne…
QuanluZhang Oct 4, 2022
ef10426
fix test of cgo engine
QuanluZhang Oct 4, 2022
17 changes: 15 additions & 2 deletions nni/algorithms/hpo/bohb_advisor/bohb_advisor.py
@@ -648,8 +648,11 @@ def handle_trial_end(self, data):
event: the job's state
hyper_params: the hyperparameters (a string) generated and returned by tuner
"""
logger.debug('Tuner handle trial end, result is %s', data)
hyper_params = nni.load(data['hyper_params'])
if self.is_created_in_previous_exp(hyper_params['parameter_id']):
# The end of the recovered trial is ignored
return
logger.debug('Tuner handle trial end, result is %s', data)
self._handle_trial_end(hyper_params['parameter_id'])
if data['trial_job_id'] in self.job_id_para_id_map:
del self.job_id_para_id_map[data['trial_job_id']]
@@ -695,6 +698,13 @@ def handle_report_metric_data(self, data):
ValueError
Data type not supported
"""
if self.is_created_in_previous_exp(data['parameter_id']):
if data['type'] == MetricType.FINAL:
# only deal with final metric using import data
param = self.get_previous_param(data['parameter_id'])
trial_data = [{'parameter': param, 'value': nni.load(data['value'])}]
self.handle_import_data(trial_data)
return
logger.debug('handle report metric data = %s', data)
if 'value' in data:
data['value'] = nni.load(data['value'])
@@ -752,7 +762,10 @@ def handle_report_metric_data(self, data):
'Data type not supported: {}'.format(data['type']))

def handle_add_customized_trial(self, data):
pass
global _next_parameter_id
# data: parameters
previous_max_param_id = self.recover_parameter_id(data)
_next_parameter_id = previous_max_param_id + 1

def handle_import_data(self, data):
"""Import additional data for tuning
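Across bohb_advisor, hyperband_advisor and msg_dispatcher (below) the fix follows one recovery pattern: record the parameter ids that belonged to the previous experiment, ignore trial-end events for those ids, and replay only their FINAL metrics through the import-data path. The following is a minimal, self-contained sketch of that control flow — hypothetical class and method names, not NNI's actual API:

class RecoveryAwareAdvisor:
    """Sketch: ignore events coming from trials created by a previous run."""

    def __init__(self):
        self.recovered_max_param_id = -1   # highest parameter id seen in the old run
        self.recovered_params = {}         # param_id -> parameters dict
        self.imported = []                 # (parameters, final value) pairs replayed

    def recover(self, old_trials):
        # old_trials: list of {'parameter_id': int, 'parameters': dict}
        for trial in old_trials:
            pid = trial['parameter_id']
            self.recovered_params[pid] = trial['parameters']
            self.recovered_max_param_id = max(self.recovered_max_param_id, pid)
        return self.recovered_max_param_id

    def is_recovered(self, param_id):
        return param_id <= self.recovered_max_param_id

    def on_trial_end(self, param_id):
        if self.is_recovered(param_id):
            return                         # the end of a recovered trial is ignored
        print(f'trial {param_id} finished in this run')

    def on_metric(self, param_id, metric_type, value):
        if self.is_recovered(param_id):
            if metric_type == 'FINAL':
                # only the final metric of a recovered trial is replayed (import path)
                self.imported.append((self.recovered_params[param_id], value))
            return
        print(f'live metric for trial {param_id}: {value}')


if __name__ == '__main__':
    advisor = RecoveryAwareAdvisor()
    advisor.recover([{'parameter_id': 0, 'parameters': {'lr': 0.1}}])
    advisor.on_metric(0, 'FINAL', 0.93)    # replayed, not handled as a live metric
    advisor.on_trial_end(0)                # silently ignored
    advisor.on_metric(1, 'FINAL', 0.95)    # a trial from the current run, handled normally

The id bookkeeping itself is what the new Recoverable.recover_parameter_id (further down in this diff) provides; the individual advisors differ only in which tuner-side import routine receives the replayed results.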
11 changes: 10 additions & 1 deletion nni/algorithms/hpo/hyperband_advisor.py
@@ -522,6 +522,9 @@ def handle_trial_end(self, data):
hyper_params: the hyperparameters (a string) generated and returned by tuner
"""
hyper_params = nni.load(data['hyper_params'])
if self.is_created_in_previous_exp(hyper_params['parameter_id']):
# The end of the recovered trial is ignored
return
self._handle_trial_end(hyper_params['parameter_id'])
if data['trial_job_id'] in self.job_id_para_id_map:
del self.job_id_para_id_map[data['trial_job_id']]
@@ -538,6 +541,9 @@ def handle_report_metric_data(self, data):
ValueError
Data type not supported
"""
if self.is_created_in_previous_exp(data['parameter_id']):
# do not support recovering the algorithm state
return
if 'value' in data:
data['value'] = nni.load(data['value'])
# multiphase? need to check
@@ -576,7 +582,10 @@ def handle_report_metric_data(self, data):
raise ValueError('Data type not supported: {}'.format(data['type']))

def handle_add_customized_trial(self, data):
pass
global _next_parameter_id
# data: parameters
previous_max_param_id = self.recover_parameter_id(data)
_next_parameter_id = previous_max_param_id + 1

def handle_import_data(self, data):
pass
13 changes: 0 additions & 13 deletions nni/algorithms/hpo/tpe_tuner.py
@@ -218,19 +218,6 @@ def import_data(self, data): # for resuming experiment
self.dedup.add_history(param)
_logger.info(f'Replayed {len(data)} FINISHED trials')

def import_customized_data(self, data): # for dedup customized / resumed
if isinstance(data, str):
data = nni.load(data)

for trial in data:
# {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...}
if isinstance(trial, str):
trial = nni.load(trial)
param = format_parameters(trial['parameters'], self.space)
self._running_params[trial['parameter_id']] = param
self.dedup.add_history(param)
_logger.info(f'Replayed {len(data)} RUNING/WAITING trials')

def suggest(args, rng, space, history):
params = {}
for key, spec in space.items():
25 changes: 22 additions & 3 deletions nni/nas/execution/common/integration.py
@@ -60,11 +60,16 @@ def __init__(self, url: str):
self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None

self.parameters_count = 0

# Sometimes messages arrive first before the callbacks get registered.
# Or in case that we allow engine to be absent during the experiment.
# Here we need to store the messages and invoke them later.
self.call_queue: List[Tuple[str, list]] = []
# this is for waiting the to-be-recovered trials from nnimanager
self._advisor_initialized = False

@property
def initialized(self):
return self._advisor_initialized

def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]):
"""
@@ -212,10 +217,22 @@ def handle_update_search_space(self, data):
self.search_space = data

def handle_trial_end(self, data):
# TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring
id_ = nni.load(data['hyper_params'])['parameter_id']
if self.is_created_in_previous_exp(id_):
_logger.info('The end of the recovered trial %d is ignored', id_)
return
_logger.debug('Trial end: %s', data)
self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
self.invoke_callback('trial_end', id_, data['event'] == 'SUCCEEDED')

def handle_report_metric_data(self, data):
# TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring
if self.is_created_in_previous_exp(data['parameter_id']):
_logger.info('The metrics of the recovered trial %d are ignored', data['parameter_id'])
return
# NOTE: this part is not aligned with hpo tuners.
# in hpo tuners, trial_job_id is used for intermediate results handling
# parameter_id is for final result handling.
_logger.debug('Metric reported: %s', data)
if data['type'] == MetricType.REQUEST_PARAMETER:
raise ValueError('Request parameter not supported')
@@ -239,4 +256,6 @@ def handle_import_data(self, data):
pass

def handle_add_customized_trial(self, data):
pass
previous_max_param_id = self.recover_parameter_id(data)
self.parameters_count = previous_max_param_id
self._advisor_initialized = True
Review comment (Contributor):
Is it possible that handle_add_customized_trial is never called and _advisor_initialized is never set to true?

Reply (QuanluZhang, author, Sep 6, 2022):
Good point! handle_add_customized_trial is called when an experiment is resumed (even if no trial needs to be recovered), but it is not called when an experiment is freshly created, so this is a bug I introduced. I have moved the flag to handle_request_trial_jobs, which means that if no trial has been requested yet, send_trial will block. The "request trial" command is always sent by nnimanager.
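As a rough illustration of the gating described in this reply (made-up names, not the actual RetiariiAdvisor), setting the flag in handle_request_trial_jobs effectively turns send_trial into a wait-until-requested call:

import threading
import time

class AdvisorStub:
    """Sketch of the described gating: trials are only sent after the manager
    has asked for them."""

    def __init__(self):
        self._ready = threading.Event()

    def handle_request_trial_jobs(self, num):
        # in the fix described above, the "initialized" flag is set here,
        # because this command is always sent by nnimanager
        self._ready.set()
        print(f'manager requested {num} trial job(s)')

    def send_trial(self, parameters):
        while not self._ready.is_set():
            print('Wait for advisor to be initialized...')
            time.sleep(0.5)
        print('sending trial', parameters)


if __name__ == '__main__':
    advisor = AdvisorStub()
    # simulate nnimanager asking for a trial one second from now
    threading.Timer(1.0, advisor.handle_request_trial_jobs, args=(1,)).start()
    advisor.send_trial({'layers': 4})   # blocks until the request arrives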

10 changes: 9 additions & 1 deletion nni/nas/execution/common/integration_api.py
@@ -6,12 +6,16 @@
'_advisor' # FIXME: hack to make it importable for tests
]

import logging
import time
import warnings
from typing import NewType, Any

import nni
from nni.common.version import version_check

_logger = logging.getLogger(__name__)

# NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor
# because it would induce cycled import
RetiariiAdvisor = NewType('RetiariiAdvisor', Any)
@@ -41,7 +45,11 @@ def send_trial(parameters: dict, placement_constraint=None) -> int:
Send a new trial. Executed on tuner end.
Return a ID that is the unique identifier for this trial.
"""
return get_advisor().send_trial(parameters, placement_constraint)
advisor = get_advisor()
while not advisor.initialized:
_logger.info('Wait for RetiariiAdvisor to be initialized...')
time.sleep(0.5)
return advisor.send_trial(parameters, placement_constraint)

Review comment (Contributor):
Suggest putting this into RetiariiAdvisor.send_trial.

Reply (QuanluZhang, author):
Updated.

def receive_trial_parameters() -> dict:
"""
30 changes: 30 additions & 0 deletions nni/recoverable.py
@@ -4,8 +4,12 @@
from __future__ import annotations

import os
import nni

class Recoverable:
def __init__(self):
self.recovered_max_param_id = -1
self.recovered_trial_params = {}

def load_checkpoint(self) -> None:
pass
@@ -18,3 +22,29 @@ def get_checkpoint_path(self) -> str | None:
if ckp_path is not None and os.path.isdir(ckp_path):
return ckp_path
return None

def recover_parameter_id(self, data) -> int:
# this is for handling the resuming of the interrupted data: parameters
if not isinstance(data, list):
data = [data]

previous_max_param_id = 0
for trial in data:
# {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...}
if isinstance(trial, str):
trial = nni.load(trial)
if not isinstance(trial['parameter_id'], int):
# for dealing with user customized trials
# skip for now
continue
self.recovered_trial_params[trial['parameter_id']] = trial['parameters']
if previous_max_param_id < trial['parameter_id']:
previous_max_param_id = trial['parameter_id']
self.recovered_max_param_id = previous_max_param_id
return previous_max_param_id

def is_created_in_previous_exp(self, param_id: int) -> bool:
return param_id <= self.recovered_max_param_id

def get_previous_param(self, param_id: int) -> dict:
return self.recovered_trial_params[param_id]
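A short standalone sketch of the invariant these helpers rely on: after a resume, fresh parameter ids start at previous_max + 1, so comparing against recovered_max_param_id cleanly separates old trials from new ones (itertools.count stands in for NNI's internal parameter-id counter here):

import itertools

recovered = [
    {'parameter_id': 0, 'parameters': {'lr': 0.1}},
    {'parameter_id': 4, 'parameters': {'lr': 0.01}},   # ids need not be contiguous
]
recovered_max_param_id = max(t['parameter_id'] for t in recovered)

# equivalent of "_next_parameter_id = previous_max_param_id + 1"
next_parameter_id = itertools.count(recovered_max_param_id + 1)

def is_created_in_previous_exp(param_id: int) -> bool:
    return param_id <= recovered_max_param_id

new_ids = [next(next_parameter_id) for _ in range(3)]   # -> [5, 6, 7]
assert all(not is_created_in_previous_exp(i) for i in new_ids)
assert is_created_in_previous_exp(0) and is_created_in_previous_exp(4)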
24 changes: 15 additions & 9 deletions nni/runtime/msg_dispatcher.py
@@ -120,15 +120,10 @@ def handle_import_data(self, data):
self.tuner.import_data(data)

def handle_add_customized_trial(self, data):
global _next_parameter_id
# data: parameters
if not isinstance(data, list):
data = [data]

for _ in data:
id_ = _create_parameter_id()
_customized_parameter_ids.add(id_)

self.tuner.import_customized_data(data)
previous_max_param_id = self.recover_parameter_id(data)
_next_parameter_id = previous_max_param_id + 1

def handle_report_metric_data(self, data):
"""
@@ -137,6 +132,13 @@ def handle_report_metric_data(self, data):
- 'value': metric value reported by nni.report_final_result()
- 'type': report type, support {'FINAL', 'PERIODICAL'}
"""
if self.is_created_in_previous_exp(data['parameter_id']):
if data['type'] == MetricType.FINAL:
# only deal with final metric using import data
param = self.get_previous_param(data['parameter_id'])
trial_data = [{'parameter': param, 'value': load(data['value'])}]
self.handle_import_data(trial_data)
return
# metrics value is dumped as json string in trial, so we need to decode it here
if 'value' in data:
data['value'] = load(data['value'])
@@ -166,14 +168,18 @@ def handle_trial_end(self, data):
- event: the job's state
- hyper_params: the hyperparameters generated and returned by tuner
"""
id_ = load(data['hyper_params'])['parameter_id']
if self.is_created_in_previous_exp(id_):
# The end of the recovered trial is ignored
return
trial_job_id = data['trial_job_id']
_ended_trials.add(trial_job_id)
if trial_job_id in _trial_history:
_trial_history.pop(trial_job_id)
if self.assessor is not None:
self.assessor.trial_end(trial_job_id, data['event'] == 'SUCCEEDED')
if self.tuner is not None:
self.tuner.trial_end(load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
self.tuner.trial_end(id_, data['event'] == 'SUCCEEDED')

def _handle_final_metric_data(self, data):
"""Call tuner to process final results
1 change: 1 addition & 0 deletions nni/runtime/msg_dispatcher_base.py
@@ -30,6 +30,7 @@ class MsgDispatcherBase(Recoverable):
"""

def __init__(self, command_channel_url=None):
super().__init__()
self.stopping = False
if command_channel_url is None:
command_channel_url = dispatcher_env_vars.NNI_TUNER_COMMAND_CHANNEL
8 changes: 0 additions & 8 deletions nni/tuner.py
@@ -219,14 +219,6 @@ def import_data(self, data: list[TrialRecord]) -> None:
# data: a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
pass

def import_customized_data(self, data: list[TrialRecord]) -> None:
"""
Internal API under revising, not recommended for end users.
"""
# Import resume data for avoiding duplications
# data: a list of dictionarys, each of which has at least two keys, 'parameter_id' and 'parameters'
pass

def _on_exit(self) -> None:
pass
