From e766a22d82ff1200be6ab94802db204021a7877e Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 5 May 2022 09:28:08 +0800 Subject: [PATCH 01/77] update --- nni/retiarii/experiment/pytorch.py | 66 ++++++++++-------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 5d1ba969cb..0d81f3ffc8 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -11,7 +11,7 @@ from pathlib import Path from subprocess import Popen from threading import Thread -from typing import Any, List, Optional, Union, cast +from typing import Any, List, Optional, Union, cast, overload import colorama import psutil @@ -396,32 +396,29 @@ def _construct_devices(self): def _create_dispatcher(self): return self._dispatcher + @overload def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: + ... + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ Run the experiment. This function will block until experiment finish or error. """ - if isinstance(self.evaluator, BaseOneShotTrainer): - # TODO: will throw a deprecation warning soon - # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' - # 'We will try to convert this trainer to our new implementation to run the algorithm. ' - # 'In case you want to stick to the old implementation, ' - # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) - self.evaluator.fit() - - if config is None: - warnings.warn('config = None is deprecate in future. If you are running a one-shot experiment, ' - 'please consider creating a config and set execution engine to `oneshot`.', DeprecationWarning) - config = RetiariiExeConfig() - config.execution_engine = 'oneshot' - - if config.execution_engine == 'oneshot': + if not isinstance(port, int): + assert port is None or isinstance(port, RetiariiExeConfig) + warnings.warn('Passing `config` in run() is deprecated.') + if port is None: + config = RetiariiExeConfig() + config.execution_engine = 'oneshot' + self.config = config + else: + self.config = port # for backward compatibility, will remove in future release + + if self.config.execution_engine == 'oneshot': base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - assert config is not None, 'You are using classic search mode, config cannot be None!' - self.config = config - self.start(port, debug) + super().run(port, wait_completion, debug) def _check_exp_status(self) -> bool: """ @@ -453,40 +450,21 @@ def stop(self) -> None: """ Stop background experiment. 
""" - _logger.info('Stopping experiment, please wait...') - atexit.unregister(self.stop) - + _logger.info('To stop experiment...') # stop strategy first if self._dispatcher_thread is not None: self._dispatcher.stopping = True self._dispatcher_thread.join(timeout=1) - - if self.id is not None: - nni.runtime.log.stop_experiment_log(self.id) - if self._proc is not None: - try: - # this if is to deal with the situation that - # nnimanager is cleaned up by ctrl+c first - if self._proc.poll() is None: - rest.delete(self.port, '/experiment') - except Exception as e: - _logger.exception(e) - _logger.warning('Cannot gracefully stop experiment, killing NNI process...') - kill_command(self._proc.pid) - - if self._pipe is not None: - self._pipe.close() - - self.id = cast(str, None) - self.port = cast(int, None) - self._proc = None - self._pipe = None + self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None - _logger.info('Experiment stopped') + + super().stop() def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', formatter: str = 'dict') -> Any: """ + TODO: the base class may also need this method + Export several top performing models. For one-shot algorithms, only top-1 is supported. For others, ``optimize_mode`` and ``formatter`` are From e0749678532ac41ab017188193e6bd6a16b213c2 Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 5 May 2022 09:54:48 +0800 Subject: [PATCH 02/77] update --- nni/retiarii/experiment/pytorch.py | 58 +----------------------------- 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 0d873b8ae0..8718a5bd6d 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -316,9 +316,7 @@ def start(self, port: int = 8080, debug: bool = False) -> None: debug Whether to start in debug mode. 
""" - atexit.register(self.stop) - - self.config = self.config.canonical_copy() + super().start(port, debug) # we will probably need a execution engine factory to make this clean and elegant if self.config.execution_engine == 'base': @@ -345,43 +343,15 @@ def start(self, port: int = 8080, debug: bool = False) -> None: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') set_execution_engine(engine) - self.id = management.generate_experiment_id() - - if self.config.experiment_working_directory is not None: - log_dir = Path(self.config.experiment_working_directory, self.id, 'log') - else: - log_dir = Path.home() / f'nni-experiments/{self.id}/log' - nni.runtime.log.start_experiment_log(self.id, log_dir, debug) - - ws_url = f'ws://localhost:{port}/tuner' - self._proc = launcher.start_experiment('create', self.id, self.config, port, debug, # type: ignore - RunMode.Background, None, ws_url, ['retiarii']) - assert self._proc is not None - connect_websocket(ws_url) - - self.port = port # port will be None if start up failed - # dispatcher must be launched after pipe initialized # the logic to launch dispatcher in background should be refactored into dispatcher api self._dispatcher = self._create_dispatcher() self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() - ips = [self.config.nni_manager_ip] - for interfaces in psutil.net_if_addrs().values(): - for interface in interfaces: - if interface.family == socket.AF_INET: - ips.append(interface.address) - ips = [f'http://{ip}:{port}' for ip in ips if ip] - msg = 'Web UI URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL - _logger.info(msg) - - exp_status_checker = Thread(target=self._check_exp_status) - exp_status_checker.start() self._start_strategy() # TODO: the experiment should be completed, when strategy exits and there is no running job _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') - exp_status_checker.join() def _construct_devices(self): devices = [] @@ -421,32 +391,6 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals else: super().run(port, wait_completion, debug) - def _check_exp_status(self) -> bool: - """ - Run the experiment. - This function will block until experiment finish or error. - Return `True` when experiment done; or return `False` when experiment failed. - """ - assert self._proc is not None - try: - while True: - time.sleep(10) - # this if is to deal with the situation that - # nnimanager is cleaned up by ctrl+c first - if self._proc.poll() is None: - status = self.get_status() - else: - return False - if status == 'DONE' or status == 'STOPPED': - return True - if status == 'ERROR': - return False - except KeyboardInterrupt: - _logger.warning('KeyboardInterrupt detected') - finally: - self.stop() - raise RuntimeError('Check experiment status failed.') - def stop(self) -> None: """ Stop background experiment. 
From 1f4eeeaf6cfba9e91d3a41dd7c442fa2cab6ecbb Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 5 May 2022 19:48:44 +0800 Subject: [PATCH 03/77] update --- nni/experiment/experiment.py | 49 ++++++++------- nni/retiarii/experiment/pytorch.py | 97 +++++++++++++++++++----------- 2 files changed, 91 insertions(+), 55 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index f514e4bdea..5cd8052369 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -87,26 +87,13 @@ def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None else: self.config = config_or_platform - def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: - """ - Start the experiment in background. - - This method will raise exception on failure. - If it returns, the experiment should have been successfully started. - - Parameters - ---------- - port - The port of web UI. - debug - Whether to start in debug mode. - """ + def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: assert self.config is not None if run_mode is not RunMode.Detach: atexit.register(self.stop) config = self.config.canonical_copy() - if config.use_annotation: + if hasattr(config, "use_annotation") and config.use_annotation: # will be refactored raise RuntimeError('NNI annotation is not supported by Python experiment API.') if config.experiment_working_directory is not None: @@ -114,13 +101,10 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo else: # this should never happen in latest version, keep it until v2.7 for potential compatibility log_dir = Path.home() / f'nni-experiments/{self.id}/log' nni.runtime.log.start_experiment_log(self.id, log_dir, debug) + return config - self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, self.url_prefix) - assert self._proc is not None - - self.port = port # port will be None if start up failed - - ips = [config.nni_manager_ip] + def _start_end(self, port: int, nni_manager_ip: str) -> None: + ips = [nni_manager_ip] for interfaces in psutil.net_if_addrs().values(): for interface in interfaces: if interface.family == socket.AF_INET: @@ -129,6 +113,29 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo msg = 'Web portal URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL _logger.info(msg) + def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + """ + Start the experiment in background. + + This method will raise exception on failure. + If it returns, the experiment should have been successfully started. + + Parameters + ---------- + port + The port of web UI. + debug + Whether to start in debug mode. + """ + config = self._start_begin(debug, run_mode) + + self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, self.url_prefix) + assert self._proc is not None + + self.port = port # port will be None if start up failed + + self._start_end(port, config.nni_manager_ip) + def stop(self) -> None: """ Stop the experiment. diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 8718a5bd6d..620573e89c 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -1,11 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import atexit +from __future__ import annotations + import logging import os -import socket -import time import warnings from dataclasses import dataclass from pathlib import Path @@ -14,24 +13,22 @@ from typing import Any, List, Optional, Union, cast, overload import colorama -import psutil import torch import torch.nn as nn -import nni.runtime.log from nni.common.device import GPUDevice -from nni.experiment import Experiment, RunMode, launcher, management, rest +from nni.experiment import Experiment, RunMode from nni.experiment.config import utils from nni.experiment.config.base import ConfigBase from nni.experiment.config.training_service import TrainingServiceConfig from nni.experiment.config.training_services import RemoteConfig from nni.runtime.protocol import connect_websocket -from nni.tools.nnictl.command_utils import kill_command from ..codegen import model_to_pytorch_script from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine from ..execution.utils import get_mutation_dict +from ..execution.interface import AbstractExecutionEngine from ..graph import Evaluator from ..integration import RetiariiAdvisor from ..mutator import Mutator @@ -304,20 +301,7 @@ def _start_strategy(self): # TODO: find out a proper way to show no more trial message on WebUI # self._dispatcher.mark_experiment_as_ending() - def start(self, port: int = 8080, debug: bool = False) -> None: - """ - Start the experiment in background. - This method will raise exception on failure. - If it returns, the experiment should have been successfully started. - Parameters - ---------- - port - The port of web UI. - debug - Whether to start in debug mode. - """ - super().start(port, debug) - + def _create_execution_engine(self) -> AbstractExecutionEngine: # we will probably need a execution engine factory to make this clean and elegant if self.config.execution_engine == 'base': from ..execution.base import BaseExecutionEngine @@ -341,6 +325,32 @@ def start(self, port: int = 8080, debug: bool = False) -> None: engine = BenchmarkExecutionEngine(self.config.benchmark) else: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') + return engine + + def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + """ + Start the experiment in background. + This method will raise exception on failure. + If it returns, the experiment should have been successfully started. + Parameters + ---------- + port + The port of web UI. + debug + Whether to start in debug mode. + """ + config = self._start_begin(debug, run_mode) + + ws_url = f'ws://localhost:{port}/tuner' + self._proc = launcher.start_experiment('create', self.id, config, port, debug, # type: ignore + RunMode.Background, None, ws_url, ['retiarii']) + assert self._proc is not None + connect_websocket(ws_url) + self.port = port # port will be None if start up failed + + self._start_end(port, config.nni_manager_ip) + + engine = self._create_execution_engine() set_execution_engine(engine) # dispatcher must be launched after pipe initialized @@ -367,29 +377,25 @@ def _construct_devices(self): def _create_dispatcher(self): return self._dispatcher - @overload def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: - ... - def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ Run the experiment. 
This function will block until experiment finish or error. """ - if not isinstance(port, int): - assert port is None or isinstance(port, RetiariiExeConfig) - warnings.warn('Passing `config` in run() is deprecated.') - if port is None: - config = RetiariiExeConfig() - config.execution_engine = 'oneshot' - self.config = config - else: - self.config = port # for backward compatibility, will remove in future release + assert port is None or isinstance(port, RetiariiExeConfig) + warnings.warn('Passing `config` in run() is deprecated.') + if port is None: + config = RetiariiExeConfig() + config.execution_engine = 'oneshot' + self.config = config + else: + self.config = port # for backward compatibility, will remove in future release if self.config.execution_engine == 'oneshot': base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - super().run(port, wait_completion, debug) + super().run(port, True, debug) def stop(self) -> None: """ @@ -451,3 +457,26 @@ def retrain_model(self, model): this function retrains the exported model, and test it to output test accuracy """ raise NotImplementedError + + +class NasExperiment(RetiariiExperiment): + """ + This class is only a new interface wrapper. + """ + def __init__(self, model: nn.Module, + evaluator: Union[BaseOneShotTrainer, Evaluator], + strategy: BaseStrategy, + config_or_platform: ExperimentConfig | str | list[str] | None = 'local', + mutators: List[Mutator] = cast(List[Mutator], None)): + ... + + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: + """ + Run the experiment. + This function will block until experiment finish or error. + """ + if self.config.execution_engine == 'oneshot': + base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) + self.strategy.run(base_model_ir, self.applied_mutators) + else: + super().run(port, wait_completion, debug) \ No newline at end of file From b9c788b819d29f63996508f5819d7ac4bbe0d1ad Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 6 May 2022 19:14:18 +0800 Subject: [PATCH 04/77] update --- nni/experiment/config/base.py | 2 + nni/experiment/config/experiment_config.py | 9 +- nni/experiment/experiment.py | 7 +- nni/retiarii/execution/cgo_engine.py | 19 ++- nni/retiarii/experiment/__init__.py | 2 + nni/retiarii/experiment/config/__init__.py | 5 + .../experiment/config/engine_config.py | 43 +++++ .../experiment/config/experiment_config.py | 67 ++++++++ nni/retiarii/experiment/pytorch.py | 157 +++++------------- 9 files changed, 187 insertions(+), 124 deletions(-) create mode 100644 nni/retiarii/experiment/config/__init__.py create mode 100644 nni/retiarii/experiment/config/engine_config.py create mode 100644 nni/retiarii/experiment/config/experiment_config.py diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index f3d44e063f..a2de758315 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -158,7 +158,9 @@ def canonical_copy(self): A deep copy. 
""" canon = copy.deepcopy(self) + print(type(canon)) canon._canonicalize([]) + print(type(canon)) canon._validate_canonical() return canon diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index 20216d7c21..e8791c0bf7 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -141,7 +141,7 @@ def _canonicalize(self, _parents): msg = f'nni_manager_ip is not set, please make sure {ip} is accessible from training machines' logging.getLogger('nni.experiment.config').warning(msg) - def _validate_canonical(self): + def _validate_canonical(self, validate_tuner: bool = True): # FIXME: remove validate_tuner super()._validate_canonical() space_cnt = (self.search_space is not None) + (self.search_space_file is not None) @@ -164,10 +164,11 @@ def _validate_canonical(self): # currently I have only seen one issue of this kind #Path(self.experiment_working_directory).mkdir(parents=True, exist_ok=True) - utils.validate_gpu_indices(self.tuner_gpu_indices) + if validate_tuner: + utils.validate_gpu_indices(self.tuner_gpu_indices) - if self.tuner is None: - raise ValueError('ExperimentConfig: tuner must be set') + if self.tuner is None: + raise ValueError('ExperimentConfig: tuner must be set') def _load_search_space_file(search_space_path): # FIXME diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 5cd8052369..524f169b8c 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -10,7 +10,7 @@ import socket from subprocess import Popen import time -from typing import Any +from typing import Any, Optional import colorama import psutil @@ -92,8 +92,9 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: if run_mode is not RunMode.Detach: atexit.register(self.stop) + print(type(self.config)) config = self.config.canonical_copy() - if hasattr(config, "use_annotation") and config.use_annotation: # will be refactored + if hasattr(config, "use_annotation") and config.use_annotation: #TODO: will be refactored raise RuntimeError('NNI annotation is not supported by Python experiment API.') if config.experiment_working_directory is not None: @@ -103,7 +104,7 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: nni.runtime.log.start_experiment_log(self.id, log_dir, debug) return config - def _start_end(self, port: int, nni_manager_ip: str) -> None: + def _start_end(self, port: int, nni_manager_ip: Optional[str]) -> None: ips = [nni_manager_ip] for interfaces in psutil.net_if_addrs().values(): for interface in interfaces: diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 4ba11987a1..509708cf54 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -7,10 +7,11 @@ import string import time import threading -from typing import Iterable, List, Dict, Tuple +from typing import Iterable, List, Dict, Tuple, cast from dataclasses import dataclass from nni.common.device import GPUDevice, Device +from nni.experiment.config.training_services import RemoteConfig from .interface import AbstractExecutionEngine, AbstractGraphListener, WorkerInfo from .. import codegen, utils from ..graph import Model, ModelStatus, MetricData, Node @@ -31,7 +32,6 @@ class TrialSubmission: placement: Dict[Node, Device] grouped_models: List[Model] - class CGOExecutionEngine(AbstractExecutionEngine): """ The execution engine with Cross-Graph Optimization (CGO). 
@@ -50,7 +50,7 @@ class CGOExecutionEngine(AbstractExecutionEngine): The trials within one batch could apply cross-graph optimization. """ - def __init__(self, devices: List[Device] = None, + def __init__(self, training_service, max_concurrency: int = None, batch_waiting_time: int = 60, ) -> None: @@ -59,6 +59,8 @@ def __init__(self, devices: List[Device] = None, self.logical_plan_counter = 0 self.available_devices: List[Device] = [] self.max_concurrency: int = max_concurrency + + devices = self._construct_devices(training_service) for device in devices: self.available_devices.append(device) self.all_devices = self.available_devices.copy() @@ -88,6 +90,17 @@ def __init__(self, devices: List[Device] = None, self._consumer_thread = threading.Thread(target=self._consume_models) self._consumer_thread.start() + def _construct_devices(self, training_service): + devices = [] + if hasattr(training_service, 'machine_list'): + for machine in cast(RemoteConfig, training_service).machine_list: + assert machine.gpu_indices is not None, \ + 'gpu_indices must be set in RemoteMachineConfig for CGO execution engine' + assert isinstance(machine.gpu_indices, list), 'gpu_indices must be a list' + for gpu_idx in machine.gpu_indices: + devices.append(GPUDevice(machine.host, gpu_idx)) + return devices + def join(self): self._stopped = True self._consumer_thread.join() diff --git a/nni/retiarii/experiment/__init__.py b/nni/retiarii/experiment/__init__.py index e69de29bb2..0eca6426d9 100644 --- a/nni/retiarii/experiment/__init__.py +++ b/nni/retiarii/experiment/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. \ No newline at end of file diff --git a/nni/retiarii/experiment/config/__init__.py b/nni/retiarii/experiment/config/__init__.py new file mode 100644 index 0000000000..38bc427477 --- /dev/null +++ b/nni/retiarii/experiment/config/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .experiment_config import * +from .engine_config import * \ No newline at end of file diff --git a/nni/retiarii/experiment/config/engine_config.py b/nni/retiarii/experiment/config/engine_config.py new file mode 100644 index 0000000000..042e227012 --- /dev/null +++ b/nni/retiarii/experiment/config/engine_config.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from dataclasses import dataclass +from typing import Optional, List + +from nni.experiment.config.base import ConfigBase + +__all__ = ['ExecutionEngineConfig', 'BaseEngineConfig', 'OneshotEngineConfig', + 'PyEngineConfig', 'CgoEngineConfig', 'BenchmarkEngineConfig'] + +@dataclass(init=False) +class ExecutionEngineConfig(ConfigBase): + """ + """ + name: str + +@dataclass(init=False) +class PyEngineConfig(ExecutionEngineConfig): + name: str = 'py' + +@dataclass(init=False) +class OneshotEngineConfig(ExecutionEngineConfig): + name: str = 'oneshot' + +@dataclass(init=False) +class BaseEngineConfig(ExecutionEngineConfig): + name: str = 'base' + # input used in GraphConverterWithShape. Currently support shape tuple only. + dummy_input: Optional[List[int]] = None + +@dataclass(init=False) +class CgoEngineConfig(ExecutionEngineConfig): + name: str = 'cgo' + max_concurrency_cgo: Optional[int] = None + batch_waiting_time: Optional[int] = None + # input used in GraphConverterWithShape. Currently support shape tuple only. 
+ dummy_input: Optional[List[int]] = None + +@dataclass(init=False) +class BenchmarkEngineConfig(ExecutionEngineConfig): + name: str = 'benchmark' + benchmark: Optional[str] = None \ No newline at end of file diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py new file mode 100644 index 0000000000..69d7185220 --- /dev/null +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +from dataclasses import dataclass +from typing import Any, Optional, Union + +from nni.experiment.config import utils, ExperimentConfig + +from .engine_config import ExecutionEngineConfig, PyEngineConfig + +__all__ = ['RetiariiExeConfig'] + +def execution_engine_config_factory(engine_name): + # FIXME: may move this function to experiment utils in future + cls = _get_ee_config_class(engine_name) + if cls is None: + raise ValueError(f'Invalid execution engine name: {engine_name}') + return cls() + +def _get_ee_config_class(engine_name): + for cls in ExecutionEngineConfig.__subclasses__(): + if cls.name == engine_name: + return cls + return None + +@dataclass(init=False) +class RetiariiExeConfig(ExperimentConfig): + # FIXME: refactor this class to inherit from a new common base class with HPO config + search_space: Any = '' + trial_code_directory: utils.PathLike = '.' + trial_command: str = '_reserved' + + execution_engine: ExecutionEngineConfig = PyEngineConfig() + + def __init__(self, training_service_platform: Optional[str] = None, + execution_engine: Union[str, ExecutionEngineConfig] = None, #TODO: having default value or not? + **kwargs): + super().__init__(training_service_platform, **kwargs) + + if execution_engine is not None: + # the user chose to init with `config = ExperimentConfig('local')` and set fields later + # we need to create empty training service & algorithm configs to support `config.tuner.name = 'random'` + assert utils.is_missing(self.execution_engine) + if isinstance(execution_engine, str): + self.execution_engine = execution_engine_config_factory(execution_engine) + else: + self.execution_engine = execution_engine + + self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name + + def __setattr__(self, key, value): + #TODO: tuner settings can also be blocked here + fixed_attrs = {'search_space': '', + 'trial_command': '_reserved'} + if key in fixed_attrs and fixed_attrs[key] != value: + raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') + # 'trial_code_directory' is handled differently because the path will be converted to absolute path by us + if key == 'trial_code_directory' and not (str(value) == '.' or os.path.isabs(value)): + raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') + #if key == 'execution_engine': + # assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.' 
+ # self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value + super().__setattr__(key, value) #TODO: double check whether new fields are validated + + def _validate_canonical(self): + super()._validate_canonical(False) \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 620573e89c..919026a589 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -4,25 +4,25 @@ from __future__ import annotations import logging -import os + import warnings -from dataclasses import dataclass -from pathlib import Path from subprocess import Popen from threading import Thread -from typing import Any, List, Optional, Union, cast, overload +from typing import Any, List, Optional, Union, cast import colorama +import psutil +from typing_extensions import Literal + import torch import torch.nn as nn -from nni.common.device import GPUDevice -from nni.experiment import Experiment, RunMode -from nni.experiment.config import utils -from nni.experiment.config.base import ConfigBase -from nni.experiment.config.training_service import TrainingServiceConfig -from nni.experiment.config.training_services import RemoteConfig +from nni.retiarii.experiment.config.engine_config import * +import nni.runtime.log +from nni.experiment import Experiment, RunMode, launcher, management +from nni.experiment.config import ExperimentConfig from nni.runtime.protocol import connect_websocket +from .config import RetiariiExeConfig, OneshotEngineConfig from ..codegen import model_to_pytorch_script from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape @@ -43,79 +43,7 @@ _logger = logging.getLogger(__name__) -__all__ = ['RetiariiExeConfig', 'RetiariiExperiment'] - - -@dataclass(init=False) -class RetiariiExeConfig(ConfigBase): - experiment_name: Optional[str] = None - search_space: Any = '' # TODO: remove - trial_command: str = '_reserved' - trial_code_directory: utils.PathLike = '.' - trial_concurrency: int - trial_gpu_number: int = 0 - devices: Optional[List[Union[str, GPUDevice]]] = None - max_experiment_duration: Optional[str] = None - max_trial_number: Optional[int] = None - max_concurrency_cgo: Optional[int] = None - batch_waiting_time: Optional[int] = None - nni_manager_ip: Optional[str] = None - debug: bool = False - log_level: str = 'info' - experiment_working_directory: utils.PathLike = '~/nni-experiments' - # remove configuration of tuner/assessor/advisor - training_service: TrainingServiceConfig - execution_engine: str = 'py' - - # input used in GraphConverterWithShape. Currently support shape tuple only. - dummy_input: Optional[List[int]] = None - - # input used for benchmark engine. 
- benchmark: Optional[str] = None - - def __init__(self, training_service_platform: Optional[str] = None, **kwargs): - super().__init__(**kwargs) - if training_service_platform is not None: - assert 'training_service' not in kwargs - self.training_service = utils.training_service_config_factory(platform=training_service_platform) - self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry py' - - def __setattr__(self, key, value): - fixed_attrs = {'search_space': '', - 'trial_command': '_reserved'} - if key in fixed_attrs and fixed_attrs[key] != value: - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - # 'trial_code_directory' is handled differently because the path will be converted to absolute path by us - if key == 'trial_code_directory' and not (str(value) == '.' or os.path.isabs(value)): - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - if key == 'execution_engine': - assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.' - self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value - self.__dict__[key] = value - - def validate(self, initialized_tuner: bool = False) -> None: - super().validate() - - @property - def _canonical_rules(self): - return _canonical_rules - - @property - def _validation_rules(self): - return _validation_rules - - -_canonical_rules = { -} - -_validation_rules = { - 'trial_code_directory': lambda value: (Path(value).is_dir(), f'"{value}" does not exist or is not directory'), - 'trial_concurrency': lambda value: value > 0, - 'trial_gpu_number': lambda value: value >= 0, - 'max_trial_number': lambda value: value > 0, - 'log_level': lambda value: value in ["trace", "debug", "info", "warning", "error", "fatal"], - 'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class') -} +__all__ = ['RetiariiExperiment', 'NasExperiment'] def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dummy_input=None, oneshot=False): @@ -252,6 +180,8 @@ class RetiariiExperiment(Experiment): def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): + nni.runtime.log.init_logger_for_command_line() + if trainer is not None: warnings.warn('Usage of `trainer` in RetiariiExperiment is deprecated and will be removed soon. ' 'Please consider specifying it as a positional argument, or use `evaluator`.', DeprecationWarning) @@ -260,6 +190,13 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E if evaluator is None: raise ValueError('Evaluator should not be none.') + self.config: RetiariiExeConfig | None = None + self.id: str = management.generate_experiment_id() + self.port: int | None = None + self._proc: Popen | psutil.Process | None = None + self._action: Literal['create', 'resume', 'view'] = 'create' + self.url_prefix: str | None = None + # TODO: The current design of init interface of Retiarii experiment needs to be reviewed. 
self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) self.port: Optional[int] = None @@ -289,8 +226,8 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E def _start_strategy(self): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, - full_ir=self.config.execution_engine not in ['py', 'benchmark'], - dummy_input=self.config.dummy_input + full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=self.config.execution_engine.dummy_input if hasattr(self.config.execution_engine, 'dummy_input') else None ) _logger.info('Start strategy...') @@ -303,23 +240,22 @@ def _start_strategy(self): def _create_execution_engine(self) -> AbstractExecutionEngine: # we will probably need a execution engine factory to make this clean and elegant - if self.config.execution_engine == 'base': + if isinstance(self.config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine engine = BaseExecutionEngine() - elif self.config.execution_engine == 'cgo': + elif isinstance(self.config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine assert self.config.training_service.platform == 'remote', \ "CGO execution engine currently only supports remote training service" assert self.config.batch_waiting_time is not None and self.config.max_concurrency_cgo is not None - devices = self._construct_devices() - engine = CGOExecutionEngine(devices, + engine = CGOExecutionEngine(self.config.training_service, max_concurrency=self.config.max_concurrency_cgo, batch_waiting_time=self.config.batch_waiting_time) - elif self.config.execution_engine == 'py': + elif isinstance(self.config.execution_engine, PyEngineConfig): from ..execution.python import PurePythonExecutionEngine engine = PurePythonExecutionEngine() - elif self.config.execution_engine == 'benchmark': + elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine assert self.config.benchmark is not None, '"benchmark" must be set when benchmark execution engine is used.' engine = BenchmarkExecutionEngine(self.config.benchmark) @@ -363,17 +299,6 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo # TODO: the experiment should be completed, when strategy exits and there is no running job _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') - def _construct_devices(self): - devices = [] - if hasattr(self.config.training_service, 'machine_list'): - for machine in cast(RemoteConfig, self.config.training_service).machine_list: - assert machine.gpu_indices is not None, \ - 'gpu_indices must be set in RemoteMachineConfig for CGO execution engine' - assert isinstance(machine.gpu_indices, list), 'gpu_indices must be a list' - for gpu_idx in machine.gpu_indices: - devices.append(GPUDevice(machine.host, gpu_idx)) - return devices - def _create_dispatcher(self): return self._dispatcher @@ -382,16 +307,20 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu Run the experiment. This function will block until experiment finish or error. 
""" - assert port is None or isinstance(port, RetiariiExeConfig) - warnings.warn('Passing `config` in run() is deprecated.') - if port is None: + if isinstance(self.evaluator, BaseOneShotTrainer): + # TODO: will throw a deprecation warning soon + # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' + # 'We will try to convert this trainer to our new implementation to run the algorithm. ' + # 'In case you want to stick to the old implementation, ' + # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) + self.evaluator.fit() + + if config is None: config = RetiariiExeConfig() - config.execution_engine = 'oneshot' - self.config = config - else: - self.config = port # for backward compatibility, will remove in future release - - if self.config.execution_engine == 'oneshot': + config.execution_engine = OneshotEngineConfig() + self.config = config + + if self.config.execution_engine.name == 'oneshot': base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: @@ -406,7 +335,7 @@ def stop(self) -> None: if self._dispatcher_thread is not None: self._dispatcher.stopping = True self._dispatcher_thread.join(timeout=1) - + self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None @@ -434,7 +363,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``dict``, the mutation history will be returned. """ if formatter == 'code': - assert self.config.execution_engine != 'py', 'You should use `dict` formatter when using Python execution engine.' + assert not isinstance(self.config.execution_engine, PyEngineConfig), 'You should use `dict` formatter when using Python execution engine.' if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' return self.evaluator.export() @@ -475,7 +404,7 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals Run the experiment. This function will block until experiment finish or error. """ - if self.config.execution_engine == 'oneshot': + if self.config.execution_engine.name == 'oneshot': #TODO base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: From 1e97e042bb3dcf0856e0c64a0ef5d045de3493ea Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 6 May 2022 19:59:00 +0800 Subject: [PATCH 05/77] update --- .../experiment/config/experiment_config.py | 2 +- nni/retiarii/experiment/pytorch.py | 45 +++++++------------ 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 69d7185220..4c3e57caf0 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -30,7 +30,7 @@ class RetiariiExeConfig(ExperimentConfig): search_space: Any = '' trial_code_directory: utils.PathLike = '.' 
trial_command: str = '_reserved' - + # new config field for NAS execution_engine: ExecutionEngineConfig = PyEngineConfig() def __init__(self, training_service_platform: Optional[str] = None, diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 919026a589..283ae982a8 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -177,8 +177,10 @@ class RetiariiExperiment(Experiment): ... final_model = Net() """ - def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), - applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), + def __init__(self, base_model: nn.Module, + evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), + applied_mutators: List[Mutator] = cast(List[Mutator], None), + strategy: BaseStrategy = cast(BaseStrategy, None), trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): nni.runtime.log.init_logger_for_command_line() @@ -196,8 +198,6 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E self._proc: Popen | psutil.Process | None = None self._action: Literal['create', 'resume', 'view'] = 'create' self.url_prefix: str | None = None - - # TODO: The current design of init interface of Retiarii experiment needs to be reviewed. self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) self.port: Optional[int] = None @@ -206,16 +206,6 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E self.applied_mutators = applied_mutators self.strategy = strategy - from nni.retiarii.oneshot.pytorch.strategy import OneShotStrategy - if not isinstance(strategy, OneShotStrategy): - self._dispatcher = RetiariiAdvisor() - else: - self._dispatcher = cast(RetiariiAdvisor, None) - self._dispatcher_thread: Optional[Thread] = None - self._proc: Optional[Popen] = None - - self.url_prefix = None - # check for sanity if not is_model_wrapped(base_model): warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + @@ -238,8 +228,8 @@ def _start_strategy(self): # TODO: find out a proper way to show no more trial message on WebUI # self._dispatcher.mark_experiment_as_ending() - def _create_execution_engine(self) -> AbstractExecutionEngine: - # we will probably need a execution engine factory to make this clean and elegant + def _create_execution_engine(self) -> None: + #TODO: we will probably need a execution engine factory to make this clean and elegant if isinstance(self.config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine engine = BaseExecutionEngine() @@ -261,7 +251,7 @@ def _create_execution_engine(self) -> AbstractExecutionEngine: engine = BenchmarkExecutionEngine(self.config.benchmark) else: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') - return engine + set_execution_engine(engine) def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: """ @@ -286,12 +276,11 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) - engine = self._create_execution_engine() - set_execution_engine(engine) + self._create_execution_engine() # FIXME: engine cannot be created twice + self._dispatcher = RetiariiAdvisor() # dispatcher must be launched after pipe initialized # the logic to launch dispatcher in background should be refactored into dispatcher api - 
self._dispatcher = self._create_dispatcher() self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() @@ -299,9 +288,6 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo # TODO: the experiment should be completed, when strategy exits and there is no running job _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') - def _create_dispatcher(self): - return self._dispatcher - def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: """ Run the experiment. @@ -314,16 +300,16 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu # 'In case you want to stick to the old implementation, ' # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) self.evaluator.fit() + return if config is None: - config = RetiariiExeConfig() - config.execution_engine = OneshotEngineConfig() - self.config = config - - if self.config.execution_engine.name == 'oneshot': + self.config = RetiariiExeConfig() + self.config.execution_engine = OneshotEngineConfig() base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) + # FIXME: oneshot strategy should also be executable on training services self.strategy.run(base_model_ir, self.applied_mutators) else: + self.config = config super().run(port, True, debug) def stop(self) -> None: @@ -396,6 +382,7 @@ def __init__(self, model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator], strategy: BaseStrategy, config_or_platform: ExperimentConfig | str | list[str] | None = 'local', + execution_engine: Union[str, ExecutionEngineConfig] = 'py', mutators: List[Mutator] = cast(List[Mutator], None)): ... @@ -404,7 +391,7 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals Run the experiment. This function will block until experiment finish or error. """ - if self.config.execution_engine.name == 'oneshot': #TODO + if isinstance(self.config.execution_engine.name, OneshotEngineConfig): base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: From 9e39e92a111db685891e1deb5f853a465b77e205 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 7 May 2022 13:33:04 +0800 Subject: [PATCH 06/77] runnable --- nni/experiment/experiment.py | 31 +++++++++++++++++------------- nni/retiarii/experiment/pytorch.py | 26 ++++++++++++------------- ts/nni_manager/core/nnimanager.ts | 2 +- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 524f169b8c..7f4402f55f 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -92,7 +92,6 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: if run_mode is not RunMode.Detach: atexit.register(self.stop) - print(type(self.config)) config = self.config.canonical_copy() if hasattr(config, "use_annotation") and config.use_annotation: #TODO: will be refactored raise RuntimeError('NNI annotation is not supported by Python experiment API.') @@ -137,11 +136,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) - def stop(self) -> None: - """ - Stop the experiment. 
- """ - _logger.info('Stopping experiment, please wait...') + def _stop(self) -> None: atexit.unregister(self.stop) nni.runtime.log.stop_experiment_log(self.id) @@ -156,8 +151,24 @@ def stop(self) -> None: self.id = None # type: ignore self.port = None self._proc = None + + def stop(self) -> None: + """ + Stop the experiment. + """ + _logger.info('Stopping experiment, please wait...') + self._stop() _logger.info('Experiment stopped') + def _wait_completion(self) -> None: + while True: + time.sleep(10) + status = self.get_status() + if status == 'DONE' or status == 'STOPPED': + return True + if status == 'ERROR': + return False + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ Run the experiment. @@ -171,13 +182,7 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals self.start(port, debug) if wait_completion: try: - while True: - time.sleep(10) - status = self.get_status() - if status == 'DONE' or status == 'STOPPED': - return True - if status == 'ERROR': - return False + self._wait_completion() except KeyboardInterrupt: _logger.warning('KeyboardInterrupt detected') self.stop() diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 283ae982a8..27ad37d3e7 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -276,17 +276,13 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) - self._create_execution_engine() # FIXME: engine cannot be created twice - self._dispatcher = RetiariiAdvisor() # dispatcher must be launched after pipe initialized # the logic to launch dispatcher in background should be refactored into dispatcher api self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() - self._start_strategy() - # TODO: the experiment should be completed, when strategy exits and there is no running job - _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') + self._create_execution_engine() # FIXME: engine cannot be created twice def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: """ @@ -310,22 +306,24 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu self.strategy.run(base_model_ir, self.applied_mutators) else: self.config = config - super().run(port, True, debug) + self.start(port, debug) + try: + self._start_strategy() + self._wait_completion() + except KeyboardInterrupt: + _logger.warning('KeyboardInterrupt detected') + self.stop() def stop(self) -> None: """ Stop background experiment. 
""" - _logger.info('To stop experiment...') - # stop strategy first - if self._dispatcher_thread is not None: - self._dispatcher.stopping = True - self._dispatcher_thread.join(timeout=1) - + _logger.info('Stopping experiment, please wait...') + self._stop() + self._dispatcher_thread.join() self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None - - super().stop() + _logger.info('Experiment stopped') def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', formatter: str = 'dict') -> Any: """ diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 7ad5a0130f..54a42760cc 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -303,8 +303,8 @@ class NNIManager implements Manager { } this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener); + this.dispatcher.sendCommand(TERMINATE); if (this.dispatcherPid > 0) { - this.dispatcher.sendCommand(TERMINATE); // gracefully terminate tuner and assessor here, wait at most 30 seconds. for (let i: number = 0; i < 30; i++) { if (!await isAlive(this.dispatcherPid)) { From 6ebd77430de26770d524452340866922737ec710 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 9 May 2022 19:56:57 +0800 Subject: [PATCH 07/77] update --- nni/experiment/experiment.py | 2 +- nni/retiarii/execution/base.py | 11 ++++++++--- nni/retiarii/execution/cgo_engine.py | 5 +++++ nni/retiarii/experiment/pytorch.py | 22 +++++++++++++--------- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 7f4402f55f..b464e3d48a 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -162,12 +162,12 @@ def stop(self) -> None: def _wait_completion(self) -> None: while True: - time.sleep(10) status = self.get_status() if status == 'DONE' or status == 'STOPPED': return True if status == 'ERROR': return False + time.sleep(10) def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index d8cda6cc8a..d488ce1d4c 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -7,6 +7,8 @@ import string from typing import Any, Dict, Iterable, List +from nni.experiment import rest + from .interface import AbstractExecutionEngine, AbstractGraphListener from .utils import get_mutation_summary from .. import codegen, utils @@ -54,12 +56,15 @@ class BaseExecutionEngine(AbstractExecutionEngine): Resource management is implemented in this class. """ - def __init__(self) -> None: + def __init__(self, rest_port: int = None, rest_url_prefix: str = None) -> None: """ Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. Base execution engine will get those callbacks and broadcast them to graph listener. 
""" + self.port = rest_port + self.url_prefix = rest_url_prefix + self._listeners: List[AbstractGraphListener] = [] # register advisor callbacks @@ -123,8 +128,8 @@ def query_available_resource(self) -> int: return self.resources def budget_exhausted(self) -> bool: - advisor = get_advisor() - return advisor.stopping + resp = rest.get(self.port, '/check-status', self.url_prefix) + return resp['status'] == 'DONE' @classmethod def pack_model_data(cls, model: Model) -> Any: diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 509708cf54..94459199dd 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -53,7 +53,12 @@ class CGOExecutionEngine(AbstractExecutionEngine): def __init__(self, training_service, max_concurrency: int = None, batch_waiting_time: int = 60, + rest_port: int = None, + rest_url_prefix: str = None ) -> None: + self.port = rest_port + self.url_prefix = rest_url_prefix + self._listeners: List[AbstractGraphListener] = [] self._running_models: Dict[int, Model] = dict() self.logical_plan_counter = 0 diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 27ad37d3e7..69a3ae5d6e 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -192,14 +192,12 @@ def __init__(self, base_model: nn.Module, if evaluator is None: raise ValueError('Evaluator should not be none.') - self.config: RetiariiExeConfig | None = None self.id: str = management.generate_experiment_id() self.port: int | None = None self._proc: Popen | psutil.Process | None = None self._action: Literal['create', 'resume', 'view'] = 'create' self.url_prefix: str | None = None - self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) - self.port: Optional[int] = None + self.config: RetiariiExeConfig | None = cast(RetiariiExeConfig, None) self.base_model = base_model self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator @@ -213,7 +211,7 @@ def __init__(self, base_model: nn.Module, 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, RuntimeWarning) - def _start_strategy(self): + def _run_strategy(self): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), @@ -232,7 +230,7 @@ def _create_execution_engine(self) -> None: #TODO: we will probably need a execution engine factory to make this clean and elegant if isinstance(self.config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine - engine = BaseExecutionEngine() + engine = BaseExecutionEngine(self.port, self.url_prefix) elif isinstance(self.config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine @@ -241,10 +239,12 @@ def _create_execution_engine(self) -> None: assert self.config.batch_waiting_time is not None and self.config.max_concurrency_cgo is not None engine = CGOExecutionEngine(self.config.training_service, max_concurrency=self.config.max_concurrency_cgo, - batch_waiting_time=self.config.batch_waiting_time) + batch_waiting_time=self.config.batch_waiting_time, + rest_port=self.port, + rest_url_prefix=self.url_prefix) elif isinstance(self.config.execution_engine, PyEngineConfig): from ..execution.python import PurePythonExecutionEngine - engine = PurePythonExecutionEngine() + engine = PurePythonExecutionEngine(self.port, self.url_prefix) elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine assert self.config.benchmark is not None, '"benchmark" must be set when benchmark execution engine is used.' @@ -284,7 +284,9 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._create_execution_engine() # FIXME: engine cannot be created twice - def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: + def run(self, config: Optional[RetiariiExeConfig] = None, + port: int = 8080, + debug: bool = False) -> None: """ Run the experiment. This function will block until experiment finish or error. 
@@ -308,11 +310,13 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu self.config = config self.start(port, debug) try: - self._start_strategy() + self._run_strategy() + # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: _logger.warning('KeyboardInterrupt detected') self.stop() + _logger.info('Search process is done, the experiment is still alive') def stop(self) -> None: """ From 81ff2469a59541427c347546927be50320866b5b Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 9 May 2022 19:59:56 +0800 Subject: [PATCH 08/77] update --- examples/nas/multi-trial/mnist/search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nas/multi-trial/mnist/search.py b/examples/nas/multi-trial/mnist/search.py index 52d1007493..6ee65a70bb 100644 --- a/examples/nas/multi-trial/mnist/search.py +++ b/examples/nas/multi-trial/mnist/search.py @@ -131,7 +131,7 @@ def evaluate_model(model_cls): exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_search' exp_config.trial_concurrency = 2 - exp_config.max_trial_number = 20 + exp_config.max_trial_number = 4 exp_config.training_service.use_active_gpu = False export_formatter = 'dict' @@ -139,7 +139,8 @@ def evaluate_model(model_cls): # exp_config.execution_engine = 'base' # export_formatter = 'code' - exp.run(exp_config, 8080) + exp.run(exp_config, 8090) print('Final model:') for model_code in exp.export_top_models(formatter=export_formatter): print(model_code) + exp.stop() \ No newline at end of file From 5d3e68122f6058b2f751c3822283d785b9d6b33f Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 14:45:28 +0800 Subject: [PATCH 09/77] fix pylint --- nni/retiarii/experiment/pytorch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 69a3ae5d6e..e85c581251 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -16,19 +16,20 @@ import torch import torch.nn as nn -from nni.retiarii.experiment.config.engine_config import * import nni.runtime.log from nni.experiment import Experiment, RunMode, launcher, management from nni.experiment.config import ExperimentConfig from nni.runtime.protocol import connect_websocket -from .config import RetiariiExeConfig, OneshotEngineConfig +from .config import ( + RetiariiExeConfig, ExecutionEngineConfig, OneshotEngineConfig, BaseEngineConfig, + PyEngineConfig, CgoEngineConfig, BenchmarkEngineConfig +) from ..codegen import model_to_pytorch_script from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine from ..execution.utils import get_mutation_dict -from ..execution.interface import AbstractExecutionEngine from ..graph import Evaluator from ..integration import RetiariiAdvisor from ..mutator import Mutator @@ -351,7 +352,8 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``dict``, the mutation history will be returned. """ if formatter == 'code': - assert not isinstance(self.config.execution_engine, PyEngineConfig), 'You should use `dict` formatter when using Python execution engine.' + assert not isinstance(self.config.execution_engine, PyEngineConfig), \ + 'You should use `dict` formatter when using Python execution engine.' 
if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' return self.evaluator.export() From 9c580d5e24e4d6f87aeb0e22deed6d930396af19 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 18:03:51 +0800 Subject: [PATCH 10/77] fix pyright --- nni/experiment/experiment.py | 2 +- nni/retiarii/execution/base.py | 3 ++- nni/retiarii/execution/cgo_engine.py | 4 ++-- nni/retiarii/experiment/pytorch.py | 15 +++++++++------ 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index b464e3d48a..4dc732eb15 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -160,7 +160,7 @@ def stop(self) -> None: self._stop() _logger.info('Experiment stopped') - def _wait_completion(self) -> None: + def _wait_completion(self) -> bool: while True: status = self.get_status() if status == 'DONE' or status == 'STOPPED': diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index d488ce1d4c..8cfe4a315c 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from __future__ import annotations import logging import os @@ -56,7 +57,7 @@ class BaseExecutionEngine(AbstractExecutionEngine): Resource management is implemented in this class. """ - def __init__(self, rest_port: int = None, rest_url_prefix: str = None) -> None: + def __init__(self, rest_port: int | None = None, rest_url_prefix: str = None) -> None: """ Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 94459199dd..376d2afe5f 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -53,8 +53,8 @@ class CGOExecutionEngine(AbstractExecutionEngine): def __init__(self, training_service, max_concurrency: int = None, batch_waiting_time: int = 60, - rest_port: int = None, - rest_url_prefix: str = None + rest_port: int | None = None, + rest_url_prefix: str | None = None ) -> None: self.port = rest_port self.url_prefix = rest_url_prefix diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index e85c581251..cf08c2fec8 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -198,7 +198,7 @@ def __init__(self, base_model: nn.Module, self._proc: Popen | psutil.Process | None = None self._action: Literal['create', 'resume', 'view'] = 'create' self.url_prefix: str | None = None - self.config: RetiariiExeConfig | None = cast(RetiariiExeConfig, None) + self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) self.base_model = base_model self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator @@ -216,7 +216,8 @@ def _run_strategy(self): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=self.config.execution_engine.dummy_input if hasattr(self.config.execution_engine, 'dummy_input') else None + dummy_input=self.config.execution_engine.dummy_input + if isinstance(self.config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) _logger.info('Start strategy...') @@ -235,12 +236,14 @@ def 
_create_execution_engine(self) -> None: elif isinstance(self.config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine - assert self.config.training_service.platform == 'remote', \ + assert not isinstance(self.config.training_service, list) \ + and self.config.training_service.platform == 'remote', \ "CGO execution engine currently only supports remote training service" - assert self.config.batch_waiting_time is not None and self.config.max_concurrency_cgo is not None + assert self.config.execution_engine.batch_waiting_time is not None \ + and self.config.execution_engine.max_concurrency_cgo is not None engine = CGOExecutionEngine(self.config.training_service, - max_concurrency=self.config.max_concurrency_cgo, - batch_waiting_time=self.config.batch_waiting_time, + max_concurrency=self.config.execution_engine.max_concurrency_cgo, + batch_waiting_time=self.config.execution_engine.batch_waiting_time, rest_port=self.port, rest_url_prefix=self.url_prefix) elif isinstance(self.config.execution_engine, PyEngineConfig): From 1c2f6debf8e742f940f2ce2ce11c817df713a249 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 18:16:35 +0800 Subject: [PATCH 11/77] update --- nni/experiment/config/base.py | 2 -- nni/experiment/experiment.py | 2 +- nni/retiarii/execution/cgo_engine.py | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index a2de758315..f3d44e063f 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -158,9 +158,7 @@ def canonical_copy(self): A deep copy. """ canon = copy.deepcopy(self) - print(type(canon)) canon._canonicalize([]) - print(type(canon)) canon._validate_canonical() return canon diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 4dc732eb15..3d06c7de34 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -93,7 +93,7 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: atexit.register(self.stop) config = self.config.canonical_copy() - if hasattr(config, "use_annotation") and config.use_annotation: #TODO: will be refactored + if config.use_annotation: raise RuntimeError('NNI annotation is not supported by Python experiment API.') if config.experiment_working_directory is not None: diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 376d2afe5f..acf3fd3524 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from __future__ import annotations import logging import os From 5086e0a96f9b8206a942d40e4b04890862064188 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 20:44:29 +0800 Subject: [PATCH 12/77] fix pyright --- .../experiment/config/experiment_config.py | 5 ++-- nni/retiarii/experiment/pytorch.py | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 4c3e57caf0..dcf9d1d9ad 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+from __future__ import annotations import os from dataclasses import dataclass @@ -33,8 +34,8 @@ class RetiariiExeConfig(ExperimentConfig): # new config field for NAS execution_engine: ExecutionEngineConfig = PyEngineConfig() - def __init__(self, training_service_platform: Optional[str] = None, - execution_engine: Union[str, ExecutionEngineConfig] = None, #TODO: having default value or not? + def __init__(self, training_service_platform: str | None = None, + execution_engine: str | ExecutionEngineConfig = PyEngineConfig(), **kwargs): super().__init__(training_service_platform, **kwargs) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index cf08c2fec8..6c92e965d2 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -251,8 +251,9 @@ def _create_execution_engine(self) -> None: engine = PurePythonExecutionEngine(self.port, self.url_prefix) elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine - assert self.config.benchmark is not None, '"benchmark" must be set when benchmark execution engine is used.' - engine = BenchmarkExecutionEngine(self.config.benchmark) + assert self.config.execution_engine.benchmark is not None, \ + '"benchmark" must be set when benchmark execution engine is used.' + engine = BenchmarkExecutionEngine(self.config.execution_engine.benchmark) else: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') set_execution_engine(engine) @@ -328,7 +329,8 @@ def stop(self) -> None: """ _logger.info('Stopping experiment, please wait...') self._stop() - self._dispatcher_thread.join() + if self._dispatcher_thread: + self._dispatcher_thread.join() self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None _logger.info('Experiment stopped') @@ -380,11 +382,11 @@ def retrain_model(self, model): """ raise NotImplementedError - +""" class NasExperiment(RetiariiExperiment): - """ - This class is only a new interface wrapper. - """ + + #This class is only a new interface wrapper. + def __init__(self, model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator], strategy: BaseStrategy, @@ -394,12 +396,11 @@ def __init__(self, model: nn.Module, ... def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: - """ - Run the experiment. - This function will block until experiment finish or error. - """ + #Run the experiment. + #This function will block until experiment finish or error. 
if isinstance(self.config.execution_engine.name, OneshotEngineConfig): base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - super().run(port, wait_completion, debug) \ No newline at end of file + super().run(port, wait_completion, debug) +""" \ No newline at end of file From db9f4e4653a044ef6575797cba0860d4526bb6f9 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 20:58:19 +0800 Subject: [PATCH 13/77] update --- nni/retiarii/execution/base.py | 2 +- nni/retiarii/experiment/config/experiment_config.py | 2 +- nni/retiarii/experiment/pytorch.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index 8cfe4a315c..869db9f1a5 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -57,7 +57,7 @@ class BaseExecutionEngine(AbstractExecutionEngine): Resource management is implemented in this class. """ - def __init__(self, rest_port: int | None = None, rest_url_prefix: str = None) -> None: + def __init__(self, rest_port: int | None = None, rest_url_prefix: str | None = None) -> None: """ Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index dcf9d1d9ad..456b404aa1 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Any from nni.experiment.config import utils, ExperimentConfig diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 6c92e965d2..6736ca2561 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -18,11 +18,10 @@ import torch.nn as nn import nni.runtime.log from nni.experiment import Experiment, RunMode, launcher, management -from nni.experiment.config import ExperimentConfig from nni.runtime.protocol import connect_websocket from .config import ( - RetiariiExeConfig, ExecutionEngineConfig, OneshotEngineConfig, BaseEngineConfig, + RetiariiExeConfig, OneshotEngineConfig, BaseEngineConfig, PyEngineConfig, CgoEngineConfig, BenchmarkEngineConfig ) from ..codegen import model_to_pytorch_script @@ -44,7 +43,7 @@ _logger = logging.getLogger(__name__) -__all__ = ['RetiariiExperiment', 'NasExperiment'] +__all__ = ['RetiariiExperiment'] def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dummy_input=None, oneshot=False): From 90971756c9a87a5ffd0a2c1f81121f66fe786c0f Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 21:48:31 +0800 Subject: [PATCH 14/77] minor --- nni/experiment/experiment.py | 2 ++ nni/retiarii/execution/base.py | 7 +++++++ nni/retiarii/execution/cgo_engine.py | 14 +++++++++----- nni/retiarii/experiment/config/engine_config.py | 2 -- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 3d06c7de34..feef6677a3 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -126,6 +126,8 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo The port of web UI. debug Whether to start in debug mode. 
+ run_mode + Running the experiment in foreground or background """ config = self._start_begin(debug, run_mode) diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index 869db9f1a5..c35d357ad0 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -62,6 +62,13 @@ def __init__(self, rest_port: int | None = None, rest_url_prefix: str | None = N Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. Base execution engine will get those callbacks and broadcast them to graph listener. + + Parameters + ---------- + rest_port + The port of the experiment's rest server + rest_url_prefix + The url prefix of the experiment's rest entry """ self.port = rest_port self.url_prefix = rest_url_prefix diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index acf3fd3524..f2d149a1d8 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -42,16 +42,20 @@ class CGOExecutionEngine(AbstractExecutionEngine): Parameters ---------- - devices : List[Device] - Available devices for execution. - max_concurrency : int + training_service + The remote training service config. + max_concurrency The maximum number of trials to run concurrently. - batch_waiting_time: int + batch_waiting_time Seconds to wait for each batch of trial submission. The trials within one batch could apply cross-graph optimization. + rest_port + The port of the experiment's rest server + rest_url_prefix + The url prefix of the experiment's rest entry """ - def __init__(self, training_service, + def __init__(self, training_service: RemoteConfig, max_concurrency: int = None, batch_waiting_time: int = 60, rest_port: int | None = None, diff --git a/nni/retiarii/experiment/config/engine_config.py b/nni/retiarii/experiment/config/engine_config.py index 042e227012..2147147622 100644 --- a/nni/retiarii/experiment/config/engine_config.py +++ b/nni/retiarii/experiment/config/engine_config.py @@ -11,8 +11,6 @@ @dataclass(init=False) class ExecutionEngineConfig(ConfigBase): - """ - """ name: str @dataclass(init=False) From 879aa567f9ff315295e122b6bbcd8141a1cb1b78 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 22:00:18 +0800 Subject: [PATCH 15/77] minor --- nni/retiarii/experiment/pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 6736ca2561..8e8243f780 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -18,6 +18,7 @@ import torch.nn as nn import nni.runtime.log from nni.experiment import Experiment, RunMode, launcher, management +from nni.experiment.config.training_services import RemoteConfig from nni.runtime.protocol import connect_websocket from .config import ( @@ -240,7 +241,7 @@ def _create_execution_engine(self) -> None: "CGO execution engine currently only supports remote training service" assert self.config.execution_engine.batch_waiting_time is not None \ and self.config.execution_engine.max_concurrency_cgo is not None - engine = CGOExecutionEngine(self.config.training_service, + engine = CGOExecutionEngine(cast(RemoteConfig, self.config.training_service), max_concurrency=self.config.execution_engine.max_concurrency_cgo, batch_waiting_time=self.config.execution_engine.batch_waiting_time, rest_port=self.port, From 1d723ad77167aa6a048f670aeca3c26f0df7a02c Mon Sep 17 00:00:00 2001 
From: quzha Date: Tue, 10 May 2022 22:27:48 +0800 Subject: [PATCH 16/77] update --- nni/retiarii/experiment/config/experiment_config.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 456b404aa1..b21a5b71df 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -39,14 +39,10 @@ def __init__(self, training_service_platform: str | None = None, **kwargs): super().__init__(training_service_platform, **kwargs) - if execution_engine is not None: - # the user chose to init with `config = ExperimentConfig('local')` and set fields later - # we need to create empty training service & algorithm configs to support `config.tuner.name = 'random'` - assert utils.is_missing(self.execution_engine) - if isinstance(execution_engine, str): - self.execution_engine = execution_engine_config_factory(execution_engine) - else: - self.execution_engine = execution_engine + if isinstance(execution_engine, str): + self.execution_engine = execution_engine_config_factory(execution_engine) + else: + self.execution_engine = execution_engine self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name From 3d9e10cea9465d82acd8b2b9b3edbac29bf584de Mon Sep 17 00:00:00 2001 From: quzha Date: Sun, 15 May 2022 17:20:33 +0800 Subject: [PATCH 17/77] resolve some comments --- .../experiment/config/experiment_config.py | 45 ++++++------ nni/retiarii/experiment/pytorch.py | 71 ++++++++++--------- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index b21a5b71df..72bc6c1125 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -8,7 +8,7 @@ from nni.experiment.config import utils, ExperimentConfig -from .engine_config import ExecutionEngineConfig, PyEngineConfig +from .engine_config import ExecutionEngineConfig __all__ = ['RetiariiExeConfig'] @@ -32,33 +32,30 @@ class RetiariiExeConfig(ExperimentConfig): trial_code_directory: utils.PathLike = '.' trial_command: str = '_reserved' # new config field for NAS - execution_engine: ExecutionEngineConfig = PyEngineConfig() + execution_engine: str | ExecutionEngineConfig def __init__(self, training_service_platform: str | None = None, - execution_engine: str | ExecutionEngineConfig = PyEngineConfig(), + execution_engine: str | ExecutionEngineConfig = 'py', **kwargs): super().__init__(training_service_platform, **kwargs) - - if isinstance(execution_engine, str): - self.execution_engine = execution_engine_config_factory(execution_engine) - else: - self.execution_engine = execution_engine - - self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name - - def __setattr__(self, key, value): - #TODO: tuner settings can also be blocked here - fixed_attrs = {'search_space': '', - 'trial_command': '_reserved'} - if key in fixed_attrs and fixed_attrs[key] != value: - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - # 'trial_code_directory' is handled differently because the path will be converted to absolute path by us - if key == 'trial_code_directory' and not (str(value) == '.' 
or os.path.isabs(value)): - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - #if key == 'execution_engine': - # assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.' - # self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value - super().__setattr__(key, value) #TODO: double check whether new fields are validated + self.execution_engine = execution_engine + + def _canonicalize(self, _parents): + msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' + if self.search_space != '': + raise ValueError(msg.format('search_space', self.search_space)) + if str(self.trial_code_directory) != '.' and not os.path.isabs(self.trial_code_directory): + raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) + if self.trial_command != '_reserved' and \ + not self.trial_command.startswith('python3 -m nni.retiarii.trial_entry '): + raise ValueError(msg.format('trial_command', self.trial_command)) + + if isinstance(self.execution_engine, str): + self.execution_engine = execution_engine_config_factory(self.execution_engine) + if self.execution_engine.name in ('py', 'base', 'cgo'): + self.trial_command = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name + + super()._canonicalize([self]) def _validate_canonical(self): super()._validate_canonical(False) \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 8e8243f780..0dba05cf36 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -8,7 +8,7 @@ import warnings from subprocess import Popen from threading import Thread -from typing import Any, List, Optional, Union, cast +from typing import Any, List, Union, cast import colorama import psutil @@ -205,6 +205,9 @@ def __init__(self, base_model: nn.Module, self.applied_mutators = applied_mutators self.strategy = strategy + self._dispatcher = None + self._dispatcher_thread = None + # check for sanity if not is_model_wrapped(base_model): warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + @@ -212,12 +215,12 @@ def __init__(self, base_model: nn.Module, 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, RuntimeWarning) - def _run_strategy(self): + def _run_strategy(self, config: RetiariiExeConfig): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, - full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=self.config.execution_engine.dummy_input - if isinstance(self.config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None + full_ir=not isinstance(config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=config.execution_engine.dummy_input + if isinstance(config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) _logger.info('Start strategy...') @@ -226,39 +229,39 @@ def _run_strategy(self): self.strategy.run(base_model_ir, self.applied_mutators) _logger.info('Strategy exit') # TODO: find out a proper way to show no more trial message on WebUI - # self._dispatcher.mark_experiment_as_ending() - def _create_execution_engine(self) -> None: + def _create_execution_engine(self, config: RetiariiExeConfig) -> None: #TODO: we will probably need a execution engine factory to make this clean and elegant - if isinstance(self.config.execution_engine, BaseEngineConfig): + if isinstance(config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine engine = BaseExecutionEngine(self.port, self.url_prefix) - elif isinstance(self.config.execution_engine, CgoEngineConfig): + elif isinstance(config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine - assert not isinstance(self.config.training_service, list) \ - and self.config.training_service.platform == 'remote', \ + assert not isinstance(config.training_service, list) \ + and config.training_service.platform == 'remote', \ "CGO execution engine currently only supports remote training service" - assert self.config.execution_engine.batch_waiting_time is not None \ - and self.config.execution_engine.max_concurrency_cgo is not None - engine = CGOExecutionEngine(cast(RemoteConfig, self.config.training_service), - max_concurrency=self.config.execution_engine.max_concurrency_cgo, - batch_waiting_time=self.config.execution_engine.batch_waiting_time, + assert config.execution_engine.batch_waiting_time is not None \ + and config.execution_engine.max_concurrency_cgo is not None + engine = CGOExecutionEngine(cast(RemoteConfig, config.training_service), + max_concurrency=config.execution_engine.max_concurrency_cgo, + batch_waiting_time=config.execution_engine.batch_waiting_time, rest_port=self.port, rest_url_prefix=self.url_prefix) - elif isinstance(self.config.execution_engine, PyEngineConfig): + elif isinstance(config.execution_engine, PyEngineConfig): from ..execution.python import PurePythonExecutionEngine engine = PurePythonExecutionEngine(self.port, self.url_prefix) - elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): + elif isinstance(config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine - assert self.config.execution_engine.benchmark is not None, \ + assert config.execution_engine.benchmark is not None, \ '"benchmark" must be set when benchmark execution engine is used.' 
- engine = BenchmarkExecutionEngine(self.config.execution_engine.benchmark) + engine = BenchmarkExecutionEngine(config.execution_engine.benchmark) else: - raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') + raise ValueError(f'Unsupported engine type: {config.execution_engine}') set_execution_engine(engine) - def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + def start(self, port: int = 8080, debug: bool = False, + run_mode: RunMode = RunMode.Background) -> RetiariiExeConfig: """ Start the experiment in background. This method will raise exception on failure. @@ -282,14 +285,12 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) self._dispatcher = RetiariiAdvisor() - # dispatcher must be launched after pipe initialized - # the logic to launch dispatcher in background should be refactored into dispatcher api self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() + return config - self._create_execution_engine() # FIXME: engine cannot be created twice - - def run(self, config: Optional[RetiariiExeConfig] = None, + def run(self, + config: RetiariiExeConfig | None = None, port: int = 8080, debug: bool = False) -> None: """ @@ -308,20 +309,26 @@ def run(self, config: Optional[RetiariiExeConfig] = None, if config is None: self.config = RetiariiExeConfig() self.config.execution_engine = OneshotEngineConfig() + else: + self.config = config + + if isinstance(self.config.execution_engine, OneshotEngineConfig) \ + or (isinstance(self.config.execution_engine, str) and self.config.execution_engine == 'oneshot'): + # this is hacky, will be refactored when oneshot can run on training services base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) - # FIXME: oneshot strategy should also be executable on training services self.strategy.run(base_model_ir, self.applied_mutators) else: - self.config = config - self.start(port, debug) + config = self.start(port, debug) + # FIXME: engine cannot be created twice + self._create_execution_engine(config) try: - self._run_strategy() + self._run_strategy(config) # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: _logger.warning('KeyboardInterrupt detected') self.stop() - _logger.info('Search process is done, the experiment is still alive') + _logger.info('Search process is done, the experiment is still alive, `stop()` can terminate the experiment.') def stop(self) -> None: """ From a8c15ea7ed7ce942782d849ac46e088dbc68ceff Mon Sep 17 00:00:00 2001 From: quzha Date: Sun, 15 May 2022 18:05:28 +0800 Subject: [PATCH 18/77] resolve comments --- nni/retiarii/experiment/pytorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 0dba05cf36..c87cdcf1bf 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -307,6 +307,8 @@ def run(self, return if config is None: + warnings.warn('config = None is deprecate in future. 
If you are running a one-shot experiment, ' + 'please consider creating a config and set execution engine to `oneshot`.', DeprecationWarning) self.config = RetiariiExeConfig() self.config.execution_engine = OneshotEngineConfig() else: From 5f4b32c52f5c2b781d164af5e04c48857c77952a Mon Sep 17 00:00:00 2001 From: quzha Date: Sun, 15 May 2022 22:37:07 +0800 Subject: [PATCH 19/77] minor --- nni/retiarii/experiment/pytorch.py | 8 -------- nni/runtime/msg_dispatcher_base.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index ce8293391e..591897014b 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -224,8 +224,6 @@ def _run_strategy(self, config: RetiariiExeConfig): _logger.info('Start strategy...') search_space = dry_run_for_formatted_search_space(base_model_ir, self.applied_mutators) - #import time - #time.sleep(10) self.update_search_space(search_space) self.strategy.run(base_model_ir, self.applied_mutators) _logger.info('Strategy exit') @@ -284,12 +282,6 @@ def start(self, port: int = 8080, debug: bool = False, self._start_end(port, config.nni_manager_ip) - #from nni.experiment import rest - #from collections import OrderedDict - #import time - #time.sleep(10) - #rest.put(8090, '/experiment?update_type=SEARCH_SPACE', {'params': {'experimentName': 'mnist_search', 'searchSpace': OrderedDict([('model_1', {'_type': 'choice', '_value': ['0', '1']}), ('model_2', {'_type': 'choice', '_value': [0.25, 0.5, 0.75]}), ('model_3', {'_type': 'choice', '_value': [64, 128, 256]})]), 'trialCommand': 'python3 -m nni.retiarii.trial_entry py', 'trialCodeDirectory': '.', 'trialConcurrency': 2, 'maxTrialNumber': 4, 'useAnnotation': False, 'debug': False, 'logLevel': 'info', 'experimentWorkingDirectory': '/home/quzha/nni-experiments', 'trainingService': {'platform': 'local', 'trialCommand': 'python3 -m nni.retiarii.trial_entry py', 'trialCodeDirectory': '/home/quzha/nni/nni/examples/nas/multi-trial/mnist', 'debug': False, 'useActiveGpu': False, 'maxTrialNumberPerGpu': 1, 'reuseMode': False}, 'executionEngine': {'name': 'py'}}, 'id': 'mn7j1h0g', 'execDuration': 0, 'logDir': '/home/quzha/nni-experiments/mn7j1h0g', 'startTime': 1652616830914, 'nextSequenceId': 0, 'revision': 2}) - self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 5f57937db8..43c14b4af7 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -27,7 +27,7 @@ def __init__(self, command_channel_url=None): if command_channel_url is None: command_channel_url = dispatcher_env_vars.NNI_TUNER_COMMAND_CHANNEL self._channel = TunerCommandChannel(command_channel_url) - # NOTE: `connect()` should be put in __init__. First, this `connect()` affects nnimanager's + # NOTE: `connect()` should be put in __init__. First, this `connect()` affects nnimanager's # starting process, without `connect()` nnimanager is blocked in `dispatcher.init()`. # Second, nas experiment uses a thread to execute `run()` of this class, thus, there is # no way to know when the websocket between nnimanager and dispatcher is built. 
The following From 6743fa79ec74de5b54e5cf765e99cf25e5315637 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 08:30:08 +0800 Subject: [PATCH 20/77] pyright --- nni/retiarii/experiment/pytorch.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 591897014b..494a828b74 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -285,7 +285,7 @@ def start(self, port: int = 8080, debug: bool = False, self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() - return config + return cast(RetiariiExeConfig, config) def run(self, config: RetiariiExeConfig | None = None, @@ -388,26 +388,3 @@ def retrain_model(self, model): this function retrains the exported model, and test it to output test accuracy """ raise NotImplementedError - -""" -class NasExperiment(RetiariiExperiment): - - #This class is only a new interface wrapper. - - def __init__(self, model: nn.Module, - evaluator: Union[BaseOneShotTrainer, Evaluator], - strategy: BaseStrategy, - config_or_platform: ExperimentConfig | str | list[str] | None = 'local', - execution_engine: Union[str, ExecutionEngineConfig] = 'py', - mutators: List[Mutator] = cast(List[Mutator], None)): - ... - - def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: - #Run the experiment. - #This function will block until experiment finish or error. - if isinstance(self.config.execution_engine.name, OneshotEngineConfig): - base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) - self.strategy.run(base_model_ir, self.applied_mutators) - else: - super().run(port, wait_completion, debug) -""" \ No newline at end of file From d1ea7f5a68de46f4ef64684f1dd9b50f2fd2368d Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 14:14:35 +0800 Subject: [PATCH 21/77] fix ut --- nni/runtime/msg_dispatcher_base.py | 13 +++++++++++-- test/ut/retiarii/test_cgo_engine.py | 6 +++--- test/ut/retiarii/test_engine.py | 4 ++-- test/ut/sdk/test_assessor.py | 2 +- test/ut/sdk/test_msg_dispatcher.py | 2 +- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 43c14b4af7..82d90d9549 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -18,8 +18,15 @@ class MsgDispatcherBase(Recoverable): - """This is where tuners and assessors are not defined yet. + """ + This is where tuners and assessors are not defined yet. Inherits this class to make your own advisor. + + .. note:: + + The class inheriting MsgDispatcherBase should be instantiated + after nnimanager (rest server) is started, so that the object + is ready to use right after its instantiation. """ def __init__(self, command_channel_url=None): @@ -34,7 +41,9 @@ def __init__(self, command_channel_url=None): # logic may crash is websocket is not built. One example is updating search space. If updating # search space too soon, as the websocket has not been built, the rest api of updating search # space will timeout. 
- self._channel.connect() + # FIXME: this is making unittest happy + if command_channel_url.startswith('ws://_unittest_'): + self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,)) diff --git a/test/ut/retiarii/test_cgo_engine.py b/test/ut/retiarii/test_cgo_engine.py index 67dde09380..8d67b26630 100644 --- a/test/ut/retiarii/test_cgo_engine.py +++ b/test/ut/retiarii/test_cgo_engine.py @@ -263,7 +263,7 @@ def test_dedup_input_four_devices(self): opt = DedupInputOptimizer() opt.convert(lp) - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -286,7 +286,7 @@ def test_dedup_input_two_devices(self): opt = DedupInputOptimizer() opt.convert(lp) - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -311,7 +311,7 @@ def test_submit_models(self): models = _load_mnist(2) - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() diff --git a/test/ut/retiarii/test_engine.py b/test/ut/retiarii/test_engine.py index 8e8f050c1a..c8cd760b8c 100644 --- a/test/ut/retiarii/test_engine.py +++ b/test/ut/retiarii/test_engine.py @@ -25,7 +25,7 @@ def test_codegen(self): def test_base_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -42,7 +42,7 @@ def test_base_execution_engine(self): def test_py_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() diff --git a/test/ut/sdk/test_assessor.py b/test/ut/sdk/test_assessor.py index 0d5e078027..48c2c03324 100644 --- a/test/ut/sdk/test_assessor.py +++ b/test/ut/sdk/test_assessor.py @@ -57,7 +57,7 @@ def test_assessor(self): _restore_io() assessor = NaiveAssessor() - dispatcher = MsgDispatcher('ws://_placeholder_', None, assessor) + dispatcher = MsgDispatcher('ws://_unittest_placeholder_', None, assessor) dispatcher._channel = LegacyCommandChannel() msg_dispatcher_base._worker_fast_exit_on_terminate = False diff --git a/test/ut/sdk/test_msg_dispatcher.py b/test/ut/sdk/test_msg_dispatcher.py index 356308501c..643d4d9b7b 100644 --- a/test/ut/sdk/test_msg_dispatcher.py +++ b/test/ut/sdk/test_msg_dispatcher.py @@ -66,7 +66,7 @@ def test_msg_dispatcher(self): _restore_io() tuner = NaiveTuner() - dispatcher = MsgDispatcher('ws://_placeholder_', tuner) + dispatcher = MsgDispatcher('ws://_unittest_placeholder_', tuner) dispatcher._channel = LegacyCommandChannel() msg_dispatcher_base._worker_fast_exit_on_terminate = False From a3b55c2b2ca9e8d96f65a4db05c6b0ff38d2df4e Mon 
Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 15:57:51 +0800 Subject: [PATCH 22/77] minor --- nni/runtime/msg_dispatcher_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 82d90d9549..6873ff2409 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -42,7 +42,7 @@ def __init__(self, command_channel_url=None): # search space too soon, as the websocket has not been built, the rest api of updating search # space will timeout. # FIXME: this is making unittest happy - if command_channel_url.startswith('ws://_unittest_'): + if not command_channel_url.startswith('ws://_unittest_'): self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() From f895116015efd6c5454d013138f4511d05b171dd Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 18:06:16 +0800 Subject: [PATCH 23/77] fix cgo pipe --- test/ut/retiarii/test_cgo_engine.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/test/ut/retiarii/test_cgo_engine.py b/test/ut/retiarii/test_cgo_engine.py index 8d67b26630..5a3605eb21 100644 --- a/test/ut/retiarii/test_cgo_engine.py +++ b/test/ut/retiarii/test_cgo_engine.py @@ -9,6 +9,7 @@ from pathlib import Path import nni +from nni.experiment.config import RemoteConfig, RemoteMachineConfig import nni.runtime.platform.test from nni.runtime.tuner_command_channel import legacy as protocol import json @@ -268,8 +269,9 @@ def test_dedup_input_four_devices(self): advisor.default_worker.start() advisor.assessor_worker.start() - available_devices = [GPUDevice("test", 0), GPUDevice("test", 1), GPUDevice("test", 2), GPUDevice("test", 3)] - cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0) + remote = RemoteConfig(machine_list=[]) + remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1,2,3])) + cgo = CGOExecutionEngine(training_service=remote, batch_waiting_time=0) phy_models = cgo._assemble(lp) self.assertTrue(len(phy_models) == 1) @@ -291,8 +293,9 @@ def test_dedup_input_two_devices(self): advisor.default_worker.start() advisor.assessor_worker.start() - available_devices = [GPUDevice("test", 0), GPUDevice("test", 1)] - cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0) + remote = RemoteConfig(machine_list=[]) + remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1])) + cgo = CGOExecutionEngine(training_service=remote, batch_waiting_time=0) phy_models = cgo._assemble(lp) self.assertTrue(len(phy_models) == 2) @@ -316,8 +319,9 @@ def test_submit_models(self): advisor.default_worker.start() advisor.assessor_worker.start() - cgo_engine = CGOExecutionEngine(devices=[GPUDevice("test", 0), GPUDevice("test", 1), - GPUDevice("test", 2), GPUDevice("test", 3)], batch_waiting_time=0) + remote = RemoteConfig(machine_list=[]) + remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1,2,3])) + cgo_engine = CGOExecutionEngine(training_service=remote, batch_waiting_time=0) set_execution_engine(cgo_engine) submit_models(*models) time.sleep(3) From 33fd0b048b7cf8da92a47fa4baae3e7da0a47d0b Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 20:02:49 +0800 Subject: [PATCH 24/77] refactor --- nni/experiment/experiment.py | 25 +++++++------- nni/retiarii/experiment/pytorch.py | 52 +++++++----------------------- 2 files changed, 25 insertions(+), 52 deletions(-) diff --git 
a/nni/experiment/experiment.py b/nni/experiment/experiment.py index feef6677a3..b9a07a8a1a 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -87,7 +87,10 @@ def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None else: self.config = config_or_platform - def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: + def _start_impl(self, port: int, debug: bool, run_mode: RunMode, + url_prefix: str | None, + tuner_command_channel: str | None, + tags: list[str] = []) -> ExperimentConfig: assert self.config is not None if run_mode is not RunMode.Detach: atexit.register(self.stop) @@ -101,10 +104,14 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: else: # this should never happen in latest version, keep it until v2.7 for potential compatibility log_dir = Path.home() / f'nni-experiments/{self.id}/log' nni.runtime.log.start_experiment_log(self.id, log_dir, debug) - return config - def _start_end(self, port: int, nni_manager_ip: Optional[str]) -> None: - ips = [nni_manager_ip] + self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, + url_prefix, tuner_command_channel, tags) + assert self._proc is not None + + self.port = port # port will be None if start up failed + + ips = [config.nni_manager_ip] for interfaces in psutil.net_if_addrs().values(): for interface in interfaces: if interface.family == socket.AF_INET: @@ -112,6 +119,7 @@ def _start_end(self, port: int, nni_manager_ip: Optional[str]) -> None: ips = [f'http://{ip}:{port}' for ip in ips if ip] msg = 'Web portal URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL _logger.info(msg) + return config def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: """ @@ -129,14 +137,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo run_mode Running the experiment in foreground or background """ - config = self._start_begin(debug, run_mode) - - self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, self.url_prefix) - assert self._proc is not None - - self.port = port # port will be None if start up failed - - self._start_end(port, config.nni_manager_ip) + self._start_impl(port, debug, run_mode, self.url_prefix, None, []) def _stop(self) -> None: atexit.unregister(self.stop) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 494a828b74..a8d9f38823 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -182,7 +182,8 @@ def __init__(self, base_model: nn.Module, applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): - nni.runtime.log.init_logger_for_command_line() + super().__init__(None) + self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) if trainer is not None: warnings.warn('Usage of `trainer` in RetiariiExperiment is deprecated and will be removed soon. 
' @@ -192,13 +193,6 @@ def __init__(self, base_model: nn.Module, if evaluator is None: raise ValueError('Evaluator should not be none.') - self.id: str = management.generate_experiment_id() - self.port: int | None = None - self._proc: Popen | psutil.Process | None = None - self._action: Literal['create', 'resume', 'view'] = 'create' - self.url_prefix: str | None = None - self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) - self.base_model = base_model self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator self.applied_mutators = applied_mutators @@ -259,33 +253,13 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: raise ValueError(f'Unsupported engine type: {config.execution_engine}') set_execution_engine(engine) - def start(self, port: int = 8080, debug: bool = False, - run_mode: RunMode = RunMode.Background) -> RetiariiExeConfig: + def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: """ - Start the experiment in background. - This method will raise exception on failure. - If it returns, the experiment should have been successfully started. - Parameters - ---------- - port - The port of web UI. - debug - Whether to start in debug mode. + By design, the only different between `start` and `run` is that `start` is asynchronous, + while `run` waits the experiment to complete. RetiariiExperiment always waits the experiment + to complete as strategy runs in foreground. """ - config = self._start_begin(debug, run_mode) - - ws_url = f'ws://localhost:{port}/tuner' - self._proc = launcher.start_experiment('create', self.id, config, port, debug, # type: ignore - RunMode.Background, None, ws_url, ['retiarii']) - assert self._proc is not None - self.port = port # port will be None if start up failed - - self._start_end(port, config.nni_manager_ip) - - self._dispatcher = RetiariiAdvisor(ws_url) - self._dispatcher_thread = Thread(target=self._dispatcher.run) - self._dispatcher_thread.start() - return cast(RetiariiExeConfig, config) + raise NotImplementedError('RetiariiExperiment is not supposed to provide `start` method') def run(self, config: RetiariiExeConfig | None = None, @@ -318,7 +292,11 @@ def run(self, base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - config = self.start(port, debug) + ws_url = f'ws://localhost:{port}/tuner' + config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + self._dispatcher = RetiariiAdvisor(ws_url) + self._dispatcher_thread = Thread(target=self._dispatcher.run) + self._dispatcher_thread.start() # FIXME: engine cannot be created twice self._create_execution_engine(config) try: @@ -382,9 +360,3 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for return [model_to_pytorch_script(model) for model in all_models[:top_k]] elif formatter == 'dict': return [get_mutation_dict(model) for model in all_models[:top_k]] - - def retrain_model(self, model): - """ - this function retrains the exported model, and test it to output test accuracy - """ - raise NotImplementedError From 7609983a689be87e0932ae030988b91ebe3afe0c Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 20:34:08 +0800 Subject: [PATCH 25/77] fix pylint --- nni/experiment/experiment.py | 2 +- nni/retiarii/experiment/pytorch.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git 
a/nni/experiment/experiment.py b/nni/experiment/experiment.py index b9a07a8a1a..194fc6e572 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -10,7 +10,7 @@ import socket from subprocess import Popen import time -from typing import Any, Optional +from typing import Any import colorama import psutil diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index a8d9f38823..2ddaefbe79 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -6,18 +6,14 @@ import logging import warnings -from subprocess import Popen from threading import Thread from typing import Any, List, Union, cast import colorama -import psutil -from typing_extensions import Literal import torch import torch.nn as nn -import nni.runtime.log -from nni.experiment import Experiment, RunMode, launcher, management +from nni.experiment import Experiment, RunMode from nni.experiment.config.training_services import RemoteConfig from .config import ( From c51a5200407106a09a578410d882e20cff40e875 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 20:38:09 +0800 Subject: [PATCH 26/77] minor --- nni/retiarii/execution/api.py | 8 +++++--- nni/retiarii/experiment/pytorch.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/nni/retiarii/execution/api.py b/nni/retiarii/execution/api.py index d0028e5e72..1d9c6bf5c5 100644 --- a/nni/retiarii/execution/api.py +++ b/nni/retiarii/execution/api.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. import time +import warnings from typing import Iterable from ..graph import Model, ModelStatus @@ -21,9 +22,10 @@ def set_execution_engine(engine: AbstractExecutionEngine) -> None: if _execution_engine is None: _execution_engine = engine else: - raise RuntimeError('Execution engine is already set. ' - 'You should avoid instantiating RetiariiExperiment twice in one process. ' - 'If you are running in a Jupyter notebook, please restart the kernel.') + warnings.warn('Execution engine is already set. ' + 'You should avoid instantiating RetiariiExperiment twice in one process. ' + 'If you are running in a Jupyter notebook, please restart the kernel.', + RuntimeWarning) def get_execution_engine() -> AbstractExecutionEngine: diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 2ddaefbe79..bb1cb9259d 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -338,7 +338,8 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``dict``, the mutation history will be returned. """ if formatter == 'code': - assert not isinstance(self.config.execution_engine, PyEngineConfig), \ + config = self.config.canonical_copy() + assert not isinstance(config.execution_engine, PyEngineConfig), \ 'You should use `dict` formatter when using Python execution engine.' if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' 
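For orientation between the patches above and below, the user-facing flow this series converges on is roughly the following sketch, adapted from the examples/nas/multi-trial/mnist/search.py script touched earlier in the series. It is an illustration, not part of any patch: model_space, evaluator and strategy are placeholders the user must supply, and the import line is the one that example assumes.

from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig

exp = RetiariiExperiment(model_space, evaluator, [], strategy)  # model_space/evaluator/strategy: user-supplied placeholders
exp_config = RetiariiExeConfig('local')                         # or RetiariiExeConfig('local', execution_engine='base')
exp_config.experiment_name = 'mnist_search'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.training_service.use_active_gpu = False

exp.run(exp_config, 8080)        # blocks until the search strategy finishes; the experiment stays alive afterwards
for model_code in exp.export_top_models(formatter='dict'):
    print(model_code)
exp.stop()                       # terminate the still-running experiment explicitly

The same flow works with the 'base' engine and formatter='code'; with the pure-Python engine, only the 'dict' formatter is supported, as the assertion above notes.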
From 7edef1abec6dbd9139ecc7dcc15ed3d371f785d4 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 22:08:04 +0800 Subject: [PATCH 27/77] fix pyright --- nni/retiarii/experiment/pytorch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index bb1cb9259d..849fd02863 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -289,14 +289,15 @@ def run(self, self.strategy.run(base_model_ir, self.applied_mutators) else: ws_url = f'ws://localhost:{port}/tuner' - config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + canonicalized_config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + canonicalized_config = cast(RetiariiExeConfig, canonicalized_config) self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() # FIXME: engine cannot be created twice - self._create_execution_engine(config) + self._create_execution_engine(canonicalized_config) try: - self._run_strategy(config) + self._run_strategy(canonicalized_config) # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: From 644cc72211d8545b0bee491016ab14872d21b0c7 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 20 May 2022 12:20:30 +0800 Subject: [PATCH 28/77] resolve comments --- nni/experiment/config/experiment_config.py | 4 ++-- nni/experiment/experiment.py | 4 ++-- nni/retiarii/execution/api.py | 5 ++--- nni/retiarii/experiment/config/experiment_config.py | 3 --- nni/retiarii/experiment/pytorch.py | 5 ++--- nni/retiarii/integration_api.py | 5 ++++- 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index e8791c0bf7..64af5e3af1 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -141,7 +141,7 @@ def _canonicalize(self, _parents): msg = f'nni_manager_ip is not set, please make sure {ip} is accessible from training machines' logging.getLogger('nni.experiment.config').warning(msg) - def _validate_canonical(self, validate_tuner: bool = True): # FIXME: remove validate_tuner + def _validate_canonical(self): super()._validate_canonical() space_cnt = (self.search_space is not None) + (self.search_space_file is not None) @@ -164,7 +164,7 @@ def _validate_canonical(self, validate_tuner: bool = True): # FIXME: remove vali # currently I have only seen one issue of this kind #Path(self.experiment_working_directory).mkdir(parents=True, exist_ok=True) - if validate_tuner: + if type(self).__name__ != 'RetiariiExeConfig': utils.validate_gpu_indices(self.tuner_gpu_indices) if self.tuner is None: diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 194fc6e572..3438221b4b 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -139,7 +139,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo """ self._start_impl(port, debug, run_mode, self.url_prefix, None, []) - def _stop(self) -> None: + def _stop_impl(self) -> None: atexit.unregister(self.stop) nni.runtime.log.stop_experiment_log(self.id) @@ -160,7 +160,7 @@ def stop(self) -> None: Stop the experiment. 
""" _logger.info('Stopping experiment, please wait...') - self._stop() + self._stop_impl() _logger.info('Experiment stopped') def _wait_completion(self) -> bool: diff --git a/nni/retiarii/execution/api.py b/nni/retiarii/execution/api.py index 1d9c6bf5c5..01c85f81ee 100644 --- a/nni/retiarii/execution/api.py +++ b/nni/retiarii/execution/api.py @@ -19,13 +19,12 @@ def set_execution_engine(engine: AbstractExecutionEngine) -> None: global _execution_engine - if _execution_engine is None: - _execution_engine = engine - else: + if _execution_engine is not None: warnings.warn('Execution engine is already set. ' 'You should avoid instantiating RetiariiExperiment twice in one process. ' 'If you are running in a Jupyter notebook, please restart the kernel.', RuntimeWarning) + _execution_engine = engine def get_execution_engine() -> AbstractExecutionEngine: diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 72bc6c1125..945bf3704e 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -56,6 +56,3 @@ def _canonicalize(self, _parents): self.trial_command = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name super()._canonicalize([self]) - - def _validate_canonical(self): - super()._validate_canonical(False) \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 849fd02863..6f4aeef3c3 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -310,7 +310,7 @@ def stop(self) -> None: Stop background experiment. """ _logger.info('Stopping experiment, please wait...') - self._stop() + self._stop_impl() if self._dispatcher_thread: self._dispatcher_thread.join() self._dispatcher = cast(RetiariiAdvisor, None) @@ -319,8 +319,6 @@ def stop(self) -> None: def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', formatter: str = 'dict') -> Any: """ - TODO: the base class may also need this method - Export several top performing models. For one-shot algorithms, only top-1 is supported. For others, ``optimize_mode`` and ``formatter`` are @@ -338,6 +336,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``code``, the python code of model will be returned. If ``dict``, the mutation history will be returned. """ + # TODO: the base class may also need this method if formatter == 'code': config = self.config.canonical_copy() assert not isinstance(config.execution_engine, PyEngineConfig), \ diff --git a/nni/retiarii/integration_api.py b/nni/retiarii/integration_api.py index dfc77bdc2b..643758ec2a 100644 --- a/nni/retiarii/integration_api.py +++ b/nni/retiarii/integration_api.py @@ -22,7 +22,10 @@ def get_advisor() -> 'RetiariiAdvisor': def register_advisor(advisor: 'RetiariiAdvisor'): global _advisor - assert _advisor is None + if _advisor is not None: + warnings.warn('Advisor is already set.' + 'You should avoid instantiating RetiariiExperiment twice in one proces.' 
+ 'If you are running in a Jupyter notebook, please restart the kernel.') _advisor = advisor From d610d43d300a3485f628f6e004a7bdfee3e7840d Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 09:25:48 +0800 Subject: [PATCH 29/77] resolve all the comments --- examples/nas/multi-trial/mnist/search.py | 5 ++--- nni/experiment/experiment.py | 5 ++--- nni/retiarii/experiment/pytorch.py | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/nas/multi-trial/mnist/search.py b/examples/nas/multi-trial/mnist/search.py index 6ee65a70bb..52d1007493 100644 --- a/examples/nas/multi-trial/mnist/search.py +++ b/examples/nas/multi-trial/mnist/search.py @@ -131,7 +131,7 @@ def evaluate_model(model_cls): exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_search' exp_config.trial_concurrency = 2 - exp_config.max_trial_number = 4 + exp_config.max_trial_number = 20 exp_config.training_service.use_active_gpu = False export_formatter = 'dict' @@ -139,8 +139,7 @@ def evaluate_model(model_cls): # exp_config.execution_engine = 'base' # export_formatter = 'code' - exp.run(exp_config, 8090) + exp.run(exp_config, 8080) print('Final model:') for model_code in exp.export_top_models(formatter=export_formatter): print(model_code) - exp.stop() \ No newline at end of file diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 3438221b4b..a5608e8dd8 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -88,7 +88,6 @@ def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None self.config = config_or_platform def _start_impl(self, port: int, debug: bool, run_mode: RunMode, - url_prefix: str | None, tuner_command_channel: str | None, tags: list[str] = []) -> ExperimentConfig: assert self.config is not None @@ -106,7 +105,7 @@ def _start_impl(self, port: int, debug: bool, run_mode: RunMode, nni.runtime.log.start_experiment_log(self.id, log_dir, debug) self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, - url_prefix, tuner_command_channel, tags) + self.url_prefix, tuner_command_channel, tags) assert self._proc is not None self.port = port # port will be None if start up failed @@ -137,7 +136,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo run_mode Running the experiment in foreground or background """ - self._start_impl(port, debug, run_mode, self.url_prefix, None, []) + self._start_impl(port, debug, run_mode, None, []) def _stop_impl(self) -> None: atexit.unregister(self.stop) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 6f4aeef3c3..2f81d781b5 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -249,7 +249,7 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: raise ValueError(f'Unsupported engine type: {config.execution_engine}') set_execution_engine(engine) - def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + def start(self, *args, **kwargs) -> None: """ By design, the only different between `start` and `run` is that `start` is asynchronous, while `run` waits the experiment to complete. 
RetiariiExperiment always waits the experiment @@ -289,10 +289,10 @@ def run(self, self.strategy.run(base_model_ir, self.applied_mutators) else: ws_url = f'ws://localhost:{port}/tuner' - canonicalized_config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + canonicalized_config = self._start_impl(port, debug, RunMode.Background, ws_url, ['retiarii']) canonicalized_config = cast(RetiariiExeConfig, canonicalized_config) self._dispatcher = RetiariiAdvisor(ws_url) - self._dispatcher_thread = Thread(target=self._dispatcher.run) + self._dispatcher_thread = Thread(target=self._dispatcher.run, daemon=True) self._dispatcher_thread.start() # FIXME: engine cannot be created twice self._create_execution_engine(canonicalized_config) From 6e9ca3533719b3d597c064256bc9b7290fddcf7a Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 12:29:09 +0800 Subject: [PATCH 30/77] add comment --- ts/nni_manager/core/nnimanager.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 54a42760cc..8bce93e860 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -303,6 +303,9 @@ class NNIManager implements Manager { } this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener); + // NOTE: this sending TERMINATE should be out of the if clause, + // because when python dispatcher is started before nnimanager + // this.dispatcherPid would not have a valid value (i.e., not >0). this.dispatcher.sendCommand(TERMINATE); if (this.dispatcherPid > 0) { // gracefully terminate tuner and assessor here, wait at most 30 seconds. From 3e4a84a21dd1f9133faffd2336f3a53c980f9d0c Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 21:20:50 +0800 Subject: [PATCH 31/77] fix bug --- nni/experiment/config/base.py | 5 +++++ nni/experiment/launcher.py | 1 + nni/retiarii/execution/base.py | 1 + nni/retiarii/execution/cgo_engine.py | 1 + nni/retiarii/experiment/config/experiment_config.py | 12 +++++++----- test/retiarii_test/cgo_mnasnet/base_mnasnet.py | 1 - test/retiarii_test/cgo_mnasnet/test.py | 8 +++----- 7 files changed, 18 insertions(+), 11 deletions(-) diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index f3d44e063f..5fb064800a 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -54,6 +54,11 @@ class ExperimentConfig(ConfigBase): Config objects will remember where they are loaded; therefore relative paths can be resolved smartly. If a config object is created with constructor, the base path will be current working directory. If it is loaded with ``ConfigBase.load(path)``, the base path will be ``path``'s parent. + + .. attention:: + + All the classes that inherit ``ConfigBase`` are not allowed to use ``from __future__ import annotations``, + because ``ConfigBase`` uses ``typeguard`` to perform runtime check and it does not support lazy annotations. 
""" def __init__(self, **kwargs): diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index eac6796e0e..dba5b026e5 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -137,6 +137,7 @@ def start_experiment( ) _logger.info('Setting up...') + print('zql: ', config.json()) rest.post(port, '/experiment', config.json(), url_prefix) except Exception as e: diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index c35d357ad0..a45299065d 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. + from __future__ import annotations import logging diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index f2d149a1d8..b959c54a4f 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. + from __future__ import annotations import logging diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 945bf3704e..18869e90f0 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -1,10 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from __future__ import annotations import os from dataclasses import dataclass -from typing import Any +from typing import Any, Union from nni.experiment.config import utils, ExperimentConfig @@ -32,10 +31,10 @@ class RetiariiExeConfig(ExperimentConfig): trial_code_directory: utils.PathLike = '.' trial_command: str = '_reserved' # new config field for NAS - execution_engine: str | ExecutionEngineConfig + execution_engine: Union[str, ExecutionEngineConfig] - def __init__(self, training_service_platform: str | None = None, - execution_engine: str | ExecutionEngineConfig = 'py', + def __init__(self, training_service_platform: Union[str, None] = None, + execution_engine: Union[str, ExecutionEngineConfig] = 'py', **kwargs): super().__init__(training_service_platform, **kwargs) self.execution_engine = execution_engine @@ -44,6 +43,7 @@ def _canonicalize(self, _parents): msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' if self.search_space != '': raise ValueError(msg.format('search_space', self.search_space)) + # TODO: maybe we should also allow users to specify trial_code_directory if str(self.trial_code_directory) != '.' 
and not os.path.isabs(self.trial_code_directory): raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) if self.trial_command != '_reserved' and \ @@ -53,6 +53,8 @@ def _canonicalize(self, _parents): if isinstance(self.execution_engine, str): self.execution_engine = execution_engine_config_factory(self.execution_engine) if self.execution_engine.name in ('py', 'base', 'cgo'): + # TODO: replace python3 with more elegant approach + # maybe use sys.executable rendered in trial side (e.g., trial_runner) self.trial_command = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name super()._canonicalize([self]) diff --git a/test/retiarii_test/cgo_mnasnet/base_mnasnet.py b/test/retiarii_test/cgo_mnasnet/base_mnasnet.py index 3e76d0bf7c..3cbb7f6c04 100644 --- a/test/retiarii_test/cgo_mnasnet/base_mnasnet.py +++ b/test/retiarii_test/cgo_mnasnet/base_mnasnet.py @@ -4,7 +4,6 @@ import torch import torch.nn as torch_nn -from torchvision.models.utils import load_state_dict_from_url import torch.nn.functional as F import sys diff --git a/test/retiarii_test/cgo_mnasnet/test.py b/test/retiarii_test/cgo_mnasnet/test.py index eac4956f3f..651591d514 100644 --- a/test/retiarii_test/cgo_mnasnet/test.py +++ b/test/retiarii_test/cgo_mnasnet/test.py @@ -8,7 +8,7 @@ from nni.retiarii import serialize from base_mnasnet import MNASNet from nni.experiment import RemoteMachineConfig -from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig +from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig, CgoEngineConfig from nni.retiarii.strategy import TPEStrategy from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -59,8 +59,6 @@ exp_config.max_trial_number = 10 exp_config.trial_gpu_number = 1 exp_config.training_service.reuse_mode = True - exp_config.max_concurrency_cgo = 3 - exp_config.batch_waiting_time = 0 rm_conf = RemoteMachineConfig() rm_conf.host = '127.0.0.1' @@ -73,6 +71,6 @@ rm_conf.max_trial_number_per_gpu = 3 exp_config.training_service.machine_list = [rm_conf] - exp_config.execution_engine = 'cgo' + exp_config.execution_engine = CgoEngineConfig(max_concurrency_cgo = 3, batch_waiting_time = 0) - exp.run(exp_config, 8099) \ No newline at end of file + exp.run(exp_config, 8099) From b6876eb7431b7593b56c992538e58e122795be90 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 21:23:16 +0800 Subject: [PATCH 32/77] remove print --- nni/experiment/launcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index dba5b026e5..eac6796e0e 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -137,7 +137,6 @@ def start_experiment( ) _logger.info('Setting up...') - print('zql: ', config.json()) rest.post(port, '/experiment', config.json(), url_prefix) except Exception as e: From 10553997a8d75d1c49d30b81e0efe33a1954c731 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 21:39:24 +0800 Subject: [PATCH 33/77] remove trailing whitespace --- nni/experiment/config/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index 5fb064800a..ab8b6f0619 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -54,7 +54,7 @@ class ExperimentConfig(ConfigBase): Config objects will remember where they are loaded; therefore relative paths can be resolved smartly. 
If a config object is created with constructor, the base path will be current working directory. If it is loaded with ``ConfigBase.load(path)``, the base path will be ``path``'s parent. - + .. attention:: All the classes that inherit ``ConfigBase`` are not allowed to use ``from __future__ import annotations``, From e0be690fc298e17c1d94d587e973dd641b20c858 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 7 Jun 2022 11:24:51 +0800 Subject: [PATCH 34/77] fix not exist issue --- nni/runtime/msg_dispatcher_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 6873ff2409..e7717874eb 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -46,8 +46,8 @@ def __init__(self, command_channel_url=None): self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() - self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,)) - self.assessor_worker = threading.Thread(target=self.command_queue_worker, args=(self.assessor_command_queue,)) + self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,), daemon=True) + self.assessor_worker = threading.Thread(target=self.command_queue_worker, args=(self.assessor_command_queue,), daemon=True) self.worker_exceptions = [] def run(self): From 811e44ec1e0f79733b47a509e009df6b7c6c3d77 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 8 Jun 2022 15:04:21 +0800 Subject: [PATCH 35/77] add unittest --- test/ut/retiarii/test_multitrial.py | 44 +++++++++++++++++++++++++++++ test/ut/retiarii/test_oneshot.py | 13 +++++---- 2 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 test/ut/retiarii/test_multitrial.py diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py new file mode 100644 index 0000000000..2f4259bad8 --- /dev/null +++ b/test/ut/retiarii/test_multitrial.py @@ -0,0 +1,44 @@ +import argparse +import torch.nn.functional as F + +from nni.retiarii import strategy +from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment +from .test_oneshot import _mnist_net + + +def test_multi_trial(): + evaluator_kwargs = { + 'max_epochs': 1 + } + + to_test = [ + # (model, evaluator) + _mnist_net('simple', evaluator_kwargs), + _mnist_net('simple_value_choice', evaluator_kwargs), + _mnist_net('value_choice', evaluator_kwargs), + _mnist_net('repeat', evaluator_kwargs), + _mnist_net('custom_op', evaluator_kwargs), + ] + + for base_model, evaluator in to_test: + search_strategy = strategy.Random() + exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) + exp_config = RetiariiExeConfig('local') + exp_config.experiment_name = 'mnist_unittest' + exp_config.trial_concurrency = 2 + exp_config.max_trial_number = 2 + exp_config.training_service.use_active_gpu = False + exp.run(exp_config, 8081) + assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--exp', type=str, default='all', metavar='E', + help='experiment to run, default = all') + args = parser.parse_args() + + if args.exp == 'all': + test_multi_trial() + else: + globals()[f'test_{args.exp}']() diff --git a/test/ut/retiarii/test_oneshot.py b/test/ut/retiarii/test_oneshot.py index 68afb7204c..5b84b61773 100644 --- a/test/ut/retiarii/test_oneshot.py +++ 
b/test/ut/retiarii/test_oneshot.py @@ -7,6 +7,7 @@ from torchvision.datasets import MNIST from torch.utils.data import Dataset, RandomSampler +import nni import nni.retiarii.nn.pytorch as nn from nni.retiarii import strategy, model_wrapper, basic_unit from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment @@ -216,13 +217,13 @@ def _mnist_net(type_, evaluator_kwargs): raise ValueError(f'Unsupported type: {type_}') transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - train_dataset = MNIST('data/mnist', train=True, download=True, transform=transform) + train_dataset = nni.trace(MNIST)('data/mnist', train=True, download=True, transform=transform) # Multi-GPU combined dataloader will break this subset sampler. Expected though. - train_random_sampler = RandomSampler(train_dataset, True, int(len(train_dataset) / 20)) - train_loader = DataLoader(train_dataset, 64, sampler=train_random_sampler) - valid_dataset = MNIST('data/mnist', train=False, download=True, transform=transform) - valid_random_sampler = RandomSampler(valid_dataset, True, int(len(valid_dataset) / 20)) - valid_loader = DataLoader(valid_dataset, 64, sampler=valid_random_sampler) + train_random_sampler = nni.trace(RandomSampler)(train_dataset, True, int(len(train_dataset) / 20)) + train_loader = nni.trace(DataLoader)(train_dataset, 64, sampler=train_random_sampler) + valid_dataset = nni.trace(MNIST)('data/mnist', train=False, download=True, transform=transform) + valid_random_sampler = nni.trace(RandomSampler)(valid_dataset, True, int(len(valid_dataset) / 20)) + valid_loader = nni.trace(DataLoader)(valid_dataset, 64, sampler=valid_random_sampler) evaluator = Classification(train_dataloader=train_loader, val_dataloaders=valid_loader, **evaluator_kwargs) return base_model, evaluator From bc849a1446dbb0604b1c80fd079a1a7c880d7040 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 8 Jun 2022 17:14:41 +0800 Subject: [PATCH 36/77] add one more test --- test/ut/retiarii/test_multitrial.py | 31 +++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 2f4259bad8..689284858a 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -1,5 +1,6 @@ import argparse -import torch.nn.functional as F +import os +from subprocess import Popen from nni.retiarii import strategy from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment @@ -28,10 +29,35 @@ def test_multi_trial(): exp_config.trial_concurrency = 2 exp_config.max_trial_number = 2 exp_config.training_service.use_active_gpu = False - exp.run(exp_config, 8081) + exp.run(exp_config, 8080) assert isinstance(exp.export_top_models()[0], dict) exp.stop() +python_script = """ +from nni.retiarii import strategy +from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment +from test_oneshot import _mnist_net + +base_model, evaluator = _mnist_net('simple', {'max_epochs': 1}) +search_strategy = strategy.Random() +exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) +exp_config = RetiariiExeConfig('local') +exp_config.experiment_name = 'mnist_unittest' +exp_config.trial_concurrency = 2 +exp_config.max_trial_number = 2 +exp_config.training_service.use_active_gpu = False +exp.run(exp_config, 8080) +assert isinstance(exp.export_top_models()[0], dict) +""" + +def test_exp_exit_without_stop(): + script_name = 
'tmp_multi_trial.py' + with open(script_name, 'w') as f: + f.write(python_script) + proc = Popen(['python3', script_name]) + proc.wait() + os.remove(script_name) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--exp', type=str, default='all', metavar='E', @@ -40,5 +66,6 @@ def test_multi_trial(): if args.exp == 'all': test_multi_trial() + test_exp_exit_without_stop() else: globals()[f'test_{args.exp}']() From 77ae20b50b926a6a9ddf5b562987ef8fead5adb2 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 8 Jun 2022 17:54:40 +0800 Subject: [PATCH 37/77] resolve comments --- nni/runtime/msg_dispatcher_base.py | 2 ++ test/ut/retiarii/test_multitrial.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index e7717874eb..99e6c71c91 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -46,6 +46,8 @@ def __init__(self, command_channel_url=None): self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() + # here daemon should be True, because their parent thread is configured as daemon to enable smooth exit of NAS experiment. + # if daemon is not set, these threads will block the daemon effect of their parent thread. self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,), daemon=True) self.assessor_worker = threading.Thread(target=self.command_queue_worker, args=(self.assessor_command_queue,), daemon=True) self.worker_exceptions = [] diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 689284858a..a309fa2292 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -1,5 +1,6 @@ import argparse import os +import sys from subprocess import Popen from nni.retiarii import strategy @@ -54,7 +55,7 @@ def test_exp_exit_without_stop(): script_name = 'tmp_multi_trial.py' with open(script_name, 'w') as f: f.write(python_script) - proc = Popen(['python3', script_name]) + proc = Popen([sys.executable, script_name]) proc.wait() os.remove(script_name) From b664a0af9928b804701d807a84421bdf5ced730f Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 9 Jun 2022 15:08:00 +0800 Subject: [PATCH 38/77] update --- test/ut/retiarii/test_multitrial.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index a309fa2292..467b6a6fdb 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -27,10 +27,11 @@ def test_multi_trial(): exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_unittest' - exp_config.trial_concurrency = 2 - exp_config.max_trial_number = 2 + exp_config.trial_concurrency = 1 + exp_config.max_trial_number = 1 exp_config.training_service.use_active_gpu = False exp.run(exp_config, 8080) + print(exp.export_top_models()) assert isinstance(exp.export_top_models()[0], dict) exp.stop() @@ -44,8 +45,8 @@ def test_multi_trial(): exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_unittest' -exp_config.trial_concurrency = 2 -exp_config.max_trial_number = 2 +exp_config.trial_concurrency = 1 +exp_config.max_trial_number = 1 exp_config.training_service.use_active_gpu = False 
exp.run(exp_config, 8080) assert isinstance(exp.export_top_models()[0], dict) From ecf87c32904b07993a822aa2ac74d0bd9c3f5ce6 Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 9 Jun 2022 15:54:20 +0800 Subject: [PATCH 39/77] fix pipeline --- test/ut/retiarii/test_multitrial.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 467b6a6fdb..2b604f179b 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -1,12 +1,15 @@ import argparse import os import sys +import pytorch_lightning as pl +import pytest from subprocess import Popen from nni.retiarii import strategy from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment from .test_oneshot import _mnist_net +pytestmark = pytest.mark.skipif(pl.__version__ < '1.0', reason='Incompatible APIs') def test_multi_trial(): evaluator_kwargs = { @@ -31,7 +34,6 @@ def test_multi_trial(): exp_config.max_trial_number = 1 exp_config.training_service.use_active_gpu = False exp.run(exp_config, 8080) - print(exp.export_top_models()) assert isinstance(exp.export_top_models()[0], dict) exp.stop() From 49fa868d15b17e5ad1d611aeecda30dc3e5a7da2 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 10 Jun 2022 09:45:40 +0800 Subject: [PATCH 40/77] add timeout for one test --- test/ut/retiarii/test_multitrial.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 2b604f179b..04812368da 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -54,6 +54,7 @@ def test_multi_trial(): assert isinstance(exp.export_top_models()[0], dict) """ +@pytest.mark.timeout(600) def test_exp_exit_without_stop(): script_name = 'tmp_multi_trial.py' with open(script_name, 'w') as f: From fc99b433e72eaf73fe20d8741d0672d9e57a6e38 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 15 Jun 2022 15:42:11 +0800 Subject: [PATCH 41/77] release note --- docs/source/release.rst | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/source/release.rst b/docs/source/release.rst index 1d7cb53956..b499df3552 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,6 +5,63 @@ Change Log ========== +Release 2.8 - 6/16/2022 +----------------------- + +Neural Architecture Search +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy +* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces +* Support AutoFormer search space in search space hub, thanks our collaborators xxx, xxx +* Support multi-GPU training of one-shot NAS +* One-shot NAS supports the NAS API `repeat` and `cell` +* Refactor of RetiariiExperiment to share the common implementation with HPO experiment +* CGO supports pytorch-lightning 1.6 + +Model Compression +^^^^^^^^^^^^^^^^^ + +* *Preview* Refactor and improvement of automatic model compress with a new `CompressionExperiment` +* Support customizating module replacement function for unsupported modules in model speedup +* Support the module replacement function for some user mentioned modules +* Support output_padding for contransposed2d in model speedup, thanks external contributor xxx + +Hyper-Parameter Optimization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Make config.tuner.name case insensitive +* 
Allow writing configurations of advisor in tuner format, i.e., aligning the configuration of advisor and tuner + +Experiment +^^^^^^^^^^ + +* Support launching multiple HPO experiments in one process +* Refactor of the logging mechanism in NNI +* Refactor of NNI manager globals for flexible and high externsibility +* Migrate dispatcher IPC to WebSocket +* Decouple lock stuffs from experiments manager logic +* Use launcher's sys.executable to detect Python interpreter + +WebUI +^^^^^ + +* Improve user experience of trial ordering in the overview page +* Fix the update issue in the trial detail page + +Documentation +^^^^^^^^^^^^^ + +* A new translation framework for document + +Notable Bugfixes +^^^^^^^^^^^^^^^^ + +* Fix TPE import issue for old metrics +* Fix the issue in TPE nested search space +* Support `RecursiveScriptModule` in speedup +* Fix the issue of failed "implicit type cast" in merge_parameter() + Release 2.7 - 4/18/2022 ----------------------- From a94cdf9a769b7803f6a0e7711fe3a8beb8b52fb6 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 21 Jun 2022 18:56:26 +0800 Subject: [PATCH 42/77] resolve comments --- README.md | 2 +- docs/source/conf.py | 2 +- docs/source/release.rst | 29 ++++++++++++++++------------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index f5009384e2..53fb2d5ccf 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ NNI automates feature engineering, neural architecture search, hyperparameter tu ## What's NEW!   -* **New release**: [v2.7 is available](https://github.com/microsoft/nni/releases/tag/v2.7) - _released on Apr-18-2022_ +* **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on Apr-18-2022_ * **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on Apr-18-2022_ * **New webinar**: [Introducing Retiarii: A deep learning exploratory-training framework on NNI](https://note.microsoft.com/MSR-Webinar-Retiarii-Registration-Live.html) - _scheduled on June-24-2021_ * **Newly upgraded documentation**: [Doc upgraded](https://nni.readthedocs.io/en/stable) diff --git a/docs/source/conf.py b/docs/source/conf.py index 882e70b01f..3b996be064 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -31,7 +31,7 @@ version = '' # The full version, including alpha/beta/rc tags # FIXME: this should be written somewhere globally -release = 'v2.7' +release = 'v2.8' # -- General configuration --------------------------------------------------- diff --git a/docs/source/release.rst b/docs/source/release.rst index b499df3552..aade4a97c0 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,43 +5,45 @@ Change Log ========== -Release 2.8 - 6/16/2022 +Release 2.8 - 6/21/2022 ----------------------- Neural Architecture Search ^^^^^^^^^^^^^^^^^^^^^^^^^^ * Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy -* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces -* Support AutoFormer search space in search space hub, thanks our collaborators xxx, xxx * Support multi-GPU training of one-shot NAS -* One-shot NAS supports the NAS API `repeat` and `cell` +* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces +* Support AutoFormer search space 
in search space hub, thanks our collaborators @nbl97 and @penghouwen +* One-shot NAS supports the NAS API ``repeat`` and ``cell`` * Refactor of RetiariiExperiment to share the common implementation with HPO experiment * CGO supports pytorch-lightning 1.6 Model Compression ^^^^^^^^^^^^^^^^^ -* *Preview* Refactor and improvement of automatic model compress with a new `CompressionExperiment` +* *Preview* Refactor and improvement of automatic model compress with a new ``CompressionExperiment`` * Support customizating module replacement function for unsupported modules in model speedup * Support the module replacement function for some user mentioned modules -* Support output_padding for contransposed2d in model speedup, thanks external contributor xxx +* Support output_padding for contransposed2d in model speedup, thanks external contributor @haoshuai-orka Hyper-Parameter Optimization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Make config.tuner.name case insensitive +* Make ``config.tuner.name`` case insensitive * Allow writing configurations of advisor in tuner format, i.e., aligning the configuration of advisor and tuner Experiment ^^^^^^^^^^ * Support launching multiple HPO experiments in one process -* Refactor of the logging mechanism in NNI -* Refactor of NNI manager globals for flexible and high externsibility -* Migrate dispatcher IPC to WebSocket -* Decouple lock stuffs from experiments manager logic -* Use launcher's sys.executable to detect Python interpreter +* Internal refactors and improvements + + * Refactor of the logging mechanism in NNI + * Refactor of NNI manager globals for flexible and high extensibility + * Migrate dispatcher IPC to WebSocket + * Decouple lock stuffs from experiments manager logic + * Use launcher's sys.executable to detect Python interpreter WebUI ^^^^^ @@ -53,13 +55,14 @@ Documentation ^^^^^^^^^^^^^ * A new translation framework for document +* Add a new quantization demo (`doc <>`__) Notable Bugfixes ^^^^^^^^^^^^^^^^ * Fix TPE import issue for old metrics * Fix the issue in TPE nested search space -* Support `RecursiveScriptModule` in speedup +* Support ``RecursiveScriptModule`` in speedup * Fix the issue of failed "implicit type cast" in merge_parameter() Release 2.7 - 4/18/2022 From 57a03f7cccefdd640973676dd6912726449a337a Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 21 Jun 2022 19:16:53 +0800 Subject: [PATCH 43/77] add doc links --- docs/source/release.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/release.rst b/docs/source/release.rst index aade4a97c0..9eba337d5f 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -11,9 +11,9 @@ Release 2.8 - 6/21/2022 Neural Architecture Search ^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy +* Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy (`doc `__) * Support multi-GPU training of one-shot NAS -* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces +* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces (`doc `__) * Support AutoFormer search space in search space hub, thanks our collaborators @nbl97 and @penghouwen * One-shot NAS supports the NAS API ``repeat`` and ``cell`` * Refactor of RetiariiExperiment to share the common 
implementation with HPO experiment @@ -55,7 +55,7 @@ Documentation ^^^^^^^^^^^^^ * A new translation framework for document -* Add a new quantization demo (`doc <>`__) +* Add a new quantization demo (`doc `__) Notable Bugfixes ^^^^^^^^^^^^^^^^ From 09ba2c52a608bc979aee8ab98edb864b65ef06e1 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 22 Jun 2022 08:30:06 +0800 Subject: [PATCH 44/77] update --- README.md | 4 ++-- docs/source/release.rst | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 53fb2d5ccf..b384ef1299 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ NNI automates feature engineering, neural architecture search, hyperparameter tu ## What's NEW!   -* **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on Apr-18-2022_ -* **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on Apr-18-2022_ +* **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on June-22-2022_ +* **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on June-22-2022_ * **New webinar**: [Introducing Retiarii: A deep learning exploratory-training framework on NNI](https://note.microsoft.com/MSR-Webinar-Retiarii-Registration-Live.html) - _scheduled on June-24-2021_ * **Newly upgraded documentation**: [Doc upgraded](https://nni.readthedocs.io/en/stable) diff --git a/docs/source/release.rst b/docs/source/release.rst index 9eba337d5f..c27575257c 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,7 +5,7 @@ Change Log ========== -Release 2.8 - 6/21/2022 +Release 2.8 - 6/22/2022 ----------------------- Neural Architecture Search @@ -23,9 +23,9 @@ Model Compression ^^^^^^^^^^^^^^^^^ * *Preview* Refactor and improvement of automatic model compress with a new ``CompressionExperiment`` -* Support customizating module replacement function for unsupported modules in model speedup +* Support customizating module replacement function for unsupported modules in model speedup (`doc `__) * Support the module replacement function for some user mentioned modules -* Support output_padding for contransposed2d in model speedup, thanks external contributor @haoshuai-orka +* Support output_padding for convtranspose2d in model speedup, thanks external contributor @haoshuai-orka Hyper-Parameter Optimization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 72f3ff8500a5c0e6a2c4654c246f2f0827bdbd83 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 5 Jul 2022 22:36:57 +0800 Subject: [PATCH 45/77] nas experiment view --- nni/experiment/config/experiment_config.py | 1 + nni/experiment/config/utils/internal.py | 19 +++++- nni/experiment/launcher.py | 3 +- .../experiment/config/experiment_config.py | 59 +++++++++++-------- nni/tools/nnictl/launcher.py | 2 +- 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index a9bae4c0da..e5124832dd 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -61,6 +61,7 @@ class ExperimentConfig(ConfigBase): # In latter case hybrid training services can have different settings. 
experiment_name: Optional[str] = None + experiment_type: str = 'hpo' search_space_file: Optional[utils.PathLike] = None search_space: Any = None trial_command: Optional[str] = None # training service field diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index d6e55ece01..1862fffc83 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,7 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'get_ipv4_address' + 'get_ipv4_address', 'init_experiment_config' ] import copy @@ -197,3 +197,20 @@ def get_ipv4_address() -> str: addr = s.getsockname()[0] s.close() return addr + +def init_experiment_config(config_json) -> ConfigBase: + from ..experiment_config import ExperimentConfig + from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig + if 'experimentType' in config_json: + if config_json['experimentType'] == 'hpo': + return ExperimentConfig(**config_json) + elif config_json['experimentType'] == 'nas': + return RetiariiExeConfig(**config_json) + else: + raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + else: + # for backward compatibility, experiment config <= v2.8 does not have "experiment_type" + if 'executionEngine' in config_json: + return RetiariiExeConfig(**config_json) + else: + return ExperimentConfig(**config_json) diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index 8ad250a59b..9c50bd92e3 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -18,6 +18,7 @@ from typing_extensions import Literal from .config import ExperimentConfig +from .config.utils import init_experiment_config from . import rest from ..tools.nnictl.config_utils import Experiments, Config from ..tools.nnictl.nnictl_utils import update_experiment @@ -203,7 +204,7 @@ def _save_experiment_information(experiment_id: str, port: int, start_time: int, def get_stopped_experiment_config(exp_id, exp_dir=None): config_json = get_stopped_experiment_config_json(exp_id, exp_dir) # type: ignore - config = ExperimentConfig(**config_json) # type: ignore + config = init_experiment_config(config_json) # type: ignore if exp_dir and not os.path.samefile(exp_dir, config.experiment_working_directory): msg = 'Experiment working directory provided in command line (%s) is different from experiment config (%s)' _logger.warning(msg, exp_dir, config.experiment_working_directory) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 4eb5ac82a0..e79347cd29 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -28,6 +28,7 @@ def _get_ee_config_class(engine_name): @dataclass(init=False) class RetiariiExeConfig(ExperimentConfig): # FIXME: refactor this class to inherit from a new common base class with HPO config + experiment_type: str = 'nas' search_space: Any = '' trial_code_directory: utils.PathLike = '.' trial_command: str = '_reserved' @@ -43,33 +44,39 @@ def __init__(self, training_service_platform: Union[str, None] = None, **kwargs): super().__init__(training_service_platform, **kwargs) self.execution_engine = execution_engine + + self._is_complete_config = False + if self.search_space != '' and self.trial_code_directory != '.' 
and self.trial_command != '_reserved': + # only experiment view and resume have complete config in init, as the config is directly loaded + self._is_complete_config = True def _canonicalize(self, _parents): - msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' - if self.search_space != '': - raise ValueError(msg.format('search_space', self.search_space)) - # TODO: maybe we should also allow users to specify trial_code_directory - if str(self.trial_code_directory) != '.' and not os.path.isabs(self.trial_code_directory): - raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) - - trial_command_tmpl = '{envs} {python} -m nni.retiarii.trial_entry {execution_engine}' - if self.trial_command != '_reserved' and '-m nni.retiarii.trial_entry' not in self.trial_command: - raise ValueError(msg.format('trial_command', self.trial_command)) - - if isinstance(self.execution_engine, str): - self.execution_engine = execution_engine_config_factory(self.execution_engine) - - _trial_command_params = { - # Default variables - 'envs': '', - # TODO: maybe use sys.executable rendered in trial side (e.g., trial_runner) - 'python': sys.executable, - 'execution_engine': self.execution_engine.name, - - # This should override the parameters above. - **(self._trial_command_params or {}) - } - - self.trial_command = trial_command_tmpl.format(**_trial_command_params).strip() + if not self._is_complete_config: + msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' + if self.search_space != '': + raise ValueError(msg.format('search_space', self.search_space)) + # TODO: maybe we should also allow users to specify trial_code_directory + if str(self.trial_code_directory) != '.' and not os.path.isabs(self.trial_code_directory): + raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) + + trial_command_tmpl = '{envs} {python} -m nni.retiarii.trial_entry {execution_engine}' + if self.trial_command != '_reserved' and '-m nni.retiarii.trial_entry' not in self.trial_command: + raise ValueError(msg.format('trial_command', self.trial_command)) + + if isinstance(self.execution_engine, str): + self.execution_engine = execution_engine_config_factory(self.execution_engine) + + _trial_command_params = { + # Default variables + 'envs': '', + # TODO: maybe use sys.executable rendered in trial side (e.g., trial_runner) + 'python': sys.executable, + 'execution_engine': self.execution_engine.name, + + # This should override the parameters above. 
+ **(self._trial_command_params or {}) + } + + self.trial_command = trial_command_tmpl.format(**_trial_command_params).strip() super()._canonicalize([self]) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index af10717654..64adc7fe1c 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -119,4 +119,4 @@ def view_experiment(args): exit() exp = Experiment._view(exp_id, exp_dir) - exp.start(port, run_mode=RunMode.Detach) + exp.start(port, run_mode=RunMode.Detach) \ No newline at end of file From e0342eb708d761c2a1e963ec3eea5f09336ae9c1 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 5 Jul 2022 22:45:28 +0800 Subject: [PATCH 46/77] minor --- nni/tools/nnictl/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 64adc7fe1c..af10717654 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -119,4 +119,4 @@ def view_experiment(args): exit() exp = Experiment._view(exp_id, exp_dir) - exp.start(port, run_mode=RunMode.Detach) \ No newline at end of file + exp.start(port, run_mode=RunMode.Detach) From 99374f3756139126d4a32e54b3210d3374918cc4 Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 7 Jul 2022 20:01:49 +0800 Subject: [PATCH 47/77] support nas experiment resume --- nni/retiarii/execution/utils.py | 37 +++++++++++++- .../experiment/config/experiment_config.py | 35 ++++++++++--- nni/retiarii/experiment/pytorch.py | 51 +++++++++---------- nni/retiarii/integration.py | 4 ++ 4 files changed, 90 insertions(+), 37 deletions(-) diff --git a/nni/retiarii/execution/utils.py b/nni/retiarii/execution/utils.py index 7615fb9988..5404cab2fc 100644 --- a/nni/retiarii/execution/utils.py +++ b/nni/retiarii/execution/utils.py @@ -1,8 +1,15 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from typing import Any, List +from typing import Any, List, cast + +from nni.experiment.config.training_services import RemoteConfig +from .interface import AbstractExecutionEngine from ..graph import Model +from ..experiment.config import ( + BaseEngineConfig, PyEngineConfig, + CgoEngineConfig, BenchmarkEngineConfig +) def _unpack_if_only_one(ele: List[Any]): if len(ele) == 1: @@ -26,3 +33,31 @@ def mutation_dict_to_summary(mutation: dict) -> dict: def get_mutation_summary(model: Model) -> dict: mutation = get_mutation_dict(model) return mutation_dict_to_summary(mutation) + +def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: + if isinstance(config.execution_engine, BaseEngineConfig): + from .base import BaseExecutionEngine + return BaseExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, CgoEngineConfig): + from .cgo_engine import CGOExecutionEngine + + assert not isinstance(config.training_service, list) \ + and config.training_service.platform == 'remote', \ + "CGO execution engine currently only supports remote training service" + assert config.execution_engine.batch_waiting_time is not None \ + and config.execution_engine.max_concurrency_cgo is not None + return CGOExecutionEngine(cast(RemoteConfig, config.training_service), + max_concurrency=config.execution_engine.max_concurrency_cgo, + batch_waiting_time=config.execution_engine.batch_waiting_time, + rest_port=port, + rest_url_prefix=url_prefix) + elif isinstance(config.execution_engine, PyEngineConfig): + from .python import PurePythonExecutionEngine + return PurePythonExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, BenchmarkEngineConfig): + from .benchmark import BenchmarkExecutionEngine + assert config.execution_engine.benchmark is not None, \ + '"benchmark" must be set when benchmark execution engine is used.' 
+ return BenchmarkExecutionEngine(config.execution_engine.benchmark) + else: + raise ValueError(f'Unsupported engine type: {config.execution_engine}') \ No newline at end of file diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index e79347cd29..b6f0149d09 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -3,8 +3,8 @@ import os import sys -from dataclasses import dataclass -from typing import Any, Dict, Union, Optional +from dataclasses import dataclass, MISSING +from typing import Any, Dict, Union, Optional, overload from nni.experiment.config import utils, ExperimentConfig @@ -12,12 +12,20 @@ __all__ = ['RetiariiExeConfig'] -def execution_engine_config_factory(engine_name): - # FIXME: may move this function to experiment utils in future +# TODO: may move this function to experiment utils in future +def init_execution_engine_config(engine_config: Union[str, dict]) -> ExecutionEngineConfig: + if isinstance(engine_config, str): + engine_name = engine_config + else: + engine_name = engine_config['name'] cls = _get_ee_config_class(engine_name) if cls is None: raise ValueError(f'Invalid execution engine name: {engine_name}') - return cls() + engine = cls() + if isinstance(engine_config, dict): + for key, value in engine_config.items(): + setattr(engine, key, value) + return engine def _get_ee_config_class(engine_name): for cls in ExecutionEngineConfig.__subclasses__(): @@ -43,8 +51,17 @@ def __init__(self, training_service_platform: Union[str, None] = None, execution_engine: Union[str, ExecutionEngineConfig] = 'py', **kwargs): super().__init__(training_service_platform, **kwargs) - self.execution_engine = execution_engine - + + if self.execution_engine != MISSING: + # this branch means kwargs is not {} and self.execution_engine has been assigned in super(), + # reassign it because super() may instantiate ExecutionEngineConfig by mistake + self.execution_engine = init_execution_engine_config(kwargs['executionEngine']) + del kwargs['executionEngine'] + elif isinstance(execution_engine, str): + self.execution_engine = init_execution_engine_config(execution_engine) + else: + self.execution_engine = execution_engine + self._is_complete_config = False if self.search_space != '' and self.trial_code_directory != '.' 
and self.trial_command != '_reserved': # only experiment view and resume have complete config in init, as the config is directly loaded @@ -63,8 +80,10 @@ def _canonicalize(self, _parents): if self.trial_command != '_reserved' and '-m nni.retiarii.trial_entry' not in self.trial_command: raise ValueError(msg.format('trial_command', self.trial_command)) + # this canonicalize is necessary because users may assign new execution engine str + # after execution engine config is instantiated if isinstance(self.execution_engine, str): - self.execution_engine = execution_engine_config_factory(self.execution_engine) + self.execution_engine = init_execution_engine_config(self.execution_engine) _trial_command_params = { # Default variables diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 8dd6118842..31d7d550b3 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -14,6 +14,7 @@ import torch import torch.nn as nn from nni.experiment import Experiment, RunMode +from nni.experiment import launcher from nni.experiment.config.training_services import RemoteConfig from .config import ( @@ -24,7 +25,7 @@ from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine -from ..execution.utils import get_mutation_dict +from ..execution.utils import get_mutation_dict, init_execution_engine from ..graph import Evaluator from ..integration import RetiariiAdvisor from ..mutator import Mutator @@ -220,33 +221,7 @@ def _run_strategy(self, config: RetiariiExeConfig): # TODO: find out a proper way to show no more trial message on WebUI def _create_execution_engine(self, config: RetiariiExeConfig) -> None: - #TODO: we will probably need a execution engine factory to make this clean and elegant - if isinstance(config.execution_engine, BaseEngineConfig): - from ..execution.base import BaseExecutionEngine - engine = BaseExecutionEngine(self.port, self.url_prefix) - elif isinstance(config.execution_engine, CgoEngineConfig): - from ..execution.cgo_engine import CGOExecutionEngine - - assert not isinstance(config.training_service, list) \ - and config.training_service.platform == 'remote', \ - "CGO execution engine currently only supports remote training service" - assert config.execution_engine.batch_waiting_time is not None \ - and config.execution_engine.max_concurrency_cgo is not None - engine = CGOExecutionEngine(cast(RemoteConfig, config.training_service), - max_concurrency=config.execution_engine.max_concurrency_cgo, - batch_waiting_time=config.execution_engine.batch_waiting_time, - rest_port=self.port, - rest_url_prefix=self.url_prefix) - elif isinstance(config.execution_engine, PyEngineConfig): - from ..execution.python import PurePythonExecutionEngine - engine = PurePythonExecutionEngine(self.port, self.url_prefix) - elif isinstance(config.execution_engine, BenchmarkEngineConfig): - from ..execution.benchmark import BenchmarkExecutionEngine - assert config.execution_engine.benchmark is not None, \ - '"benchmark" must be set when benchmark execution engine is used.' 
- engine = BenchmarkExecutionEngine(config.execution_engine.benchmark) - else: - raise ValueError(f'Unsupported engine type: {config.execution_engine}') + engine = init_execution_engine(config, self.port, self.url_prefix) set_execution_engine(engine) def start(self, *args, **kwargs) -> None: @@ -360,3 +335,23 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for return [model_to_pytorch_script(model) for model in all_models[:top_k]] elif formatter == 'dict': return [get_mutation_dict(model) for model in all_models[:top_k]] + + def resume(self, experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): + """ + Resume a stopped experiment. + + Parameters + ---------- + experiment_id + The stopped experiment id. + port + The port of web UI. + wait_completion + If true, run in the foreground. If false, run in the background. + debug + Whether to start in debug mode. + """ + self.id = experiment_id + self._action = 'resume' + config = launcher.get_stopped_experiment_config(experiment_id, None) + self.run(config, port=port, debug=debug) \ No newline at end of file diff --git a/nni/retiarii/integration.py b/nni/retiarii/integration.py index 26f479727f..f56f66308d 100644 --- a/nni/retiarii/integration.py +++ b/nni/retiarii/integration.py @@ -184,3 +184,7 @@ def _process_value(value) -> Any: # hopefully a float else: return value return value + + def handle_import_data(self, data): + # FIXME: ignore imported data for now, as strategy has not supported resume + pass From cbaad0f213a024e3c7374a780d024f364c2ca0ed Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 7 Jul 2022 20:12:34 +0800 Subject: [PATCH 48/77] fix pylint --- nni/retiarii/experiment/config/experiment_config.py | 2 +- nni/retiarii/experiment/pytorch.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index b6f0149d09..dc07518513 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -4,7 +4,7 @@ import os import sys from dataclasses import dataclass, MISSING -from typing import Any, Dict, Union, Optional, overload +from typing import Any, Dict, Union, Optional from nni.experiment.config import utils, ExperimentConfig diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 31d7d550b3..7c22554d8f 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -15,7 +15,6 @@ import torch.nn as nn from nni.experiment import Experiment, RunMode from nni.experiment import launcher -from nni.experiment.config.training_services import RemoteConfig from .config import ( RetiariiExeConfig, OneshotEngineConfig, BaseEngineConfig, From 0abe5a3c8452b721f41b443f2df31014cadaa947 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 11 Jul 2022 10:31:44 +0800 Subject: [PATCH 49/77] finish main functionality --- nni/experiment/config/utils/internal.py | 18 +++- nni/retiarii/experiment/pytorch.py | 116 ++++++++++++++++++++---- nni/retiarii/graph.py | 10 +- nni/tools/nnictl/launcher.py | 19 +++- 4 files changed, 138 insertions(+), 25 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 1862fffc83..4e6e11934c 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,7 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 
'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'get_ipv4_address', 'init_experiment_config' + 'get_ipv4_address', 'init_experiment_config', 'get_experiment_class_using_config' ] import copy @@ -214,3 +214,19 @@ def init_experiment_config(config_json) -> ConfigBase: return RetiariiExeConfig(**config_json) else: return ExperimentConfig(**config_json) + +def get_experiment_class_using_config(config_json): + from ...experiment import Experiment + from nni.retiarii.experiment.pytorch import RetiariiExperiment + if 'experimentType' in config_json: + if config_json['experimentType'] == 'hpo': + return Experiment + elif config_json['experimentType'] == 'nas': + return RetiariiExperiment + else: + raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + else: + if 'executionEngine' in config_json: + return RetiariiExperiment + else: + return Experiment \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 7c22554d8f..93ed0c649c 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -5,6 +5,8 @@ import logging +import os +import time import warnings from threading import Thread from typing import Any, List, Union, cast @@ -13,6 +15,7 @@ import torch import torch.nn as nn +from nni.common import dump, load from nni.experiment import Experiment, RunMode from nni.experiment import launcher @@ -25,7 +28,7 @@ from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine from ..execution.utils import get_mutation_dict, init_execution_engine -from ..graph import Evaluator +from ..graph import Evaluator, Model from ..integration import RetiariiAdvisor from ..mutator import Mutator from ..nn.pytorch.mutator import ( @@ -35,6 +38,7 @@ from ..serializer import is_model_wrapped from ..strategy import BaseStrategy from ..strategy.utils import dry_run_for_formatted_search_space +from nni.retiarii import strategy _logger = logging.getLogger(__name__) @@ -186,7 +190,8 @@ def __init__(self, base_model: nn.Module, 'Please consider specifying it as a positional argument, or use `evaluator`.', DeprecationWarning) evaluator = trainer - if evaluator is None: + # base_model is None means the experiment is in resume or view mode + if base_model is not None and evaluator is None: raise ValueError('Evaluator should not be none.') self.base_model = base_model @@ -204,18 +209,11 @@ def __init__(self, base_model: nn.Module, 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, RuntimeWarning) - def _run_strategy(self, config: RetiariiExeConfig): - base_model_ir, self.applied_mutators = preprocess_model( - self.base_model, self.evaluator, self.applied_mutators, - full_ir=not isinstance(config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=config.execution_engine.dummy_input - if isinstance(config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None - ) - + def _run_strategy(self, base_model_ir: Model, applied_mutators: List[Mutator]) -> None: _logger.info('Start strategy...') - search_space = dry_run_for_formatted_search_space(base_model_ir, self.applied_mutators) + search_space = dry_run_for_formatted_search_space(base_model_ir, applied_mutators) self.update_search_space(search_space) - self.strategy.run(base_model_ir, self.applied_mutators) + self.strategy.run(base_model_ir, applied_mutators) _logger.info('Strategy exit') # TODO: find out a proper way to show no more trial message on WebUI @@ -223,6 +221,29 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: engine = init_execution_engine(config, self.port, self.url_prefix) set_execution_engine(engine) + def _save_experiment_checkpoint(self, + base_model_ir, + applied_mutators, + strategy) -> None: + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: + dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) + with open(os.path.join(ckp_path, 'applied_mutators'), 'w') as fp: + dump(applied_mutators, fp) + with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: + dump(strategy, fp) + + def _load_experiment_checkpoint(self): + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: + base_model_ir = load(fp=fp) + base_model_ir = Model._load(base_model_ir) + with open(os.path.join(ckp_path, 'applied_mutators'), 'r') as fp: + applied_mutators = load(fp=fp) + with open(os.path.join(ckp_path, 'strategy'), 'r') as fp: + strategy = load(fp=fp) + return base_model_ir, applied_mutators, strategy + def start(self, *args, **kwargs) -> None: """ By design, the only different between `start` and `run` is that `start` is asynchronous, @@ -271,7 +292,20 @@ def run(self, # FIXME: engine cannot be created twice self._create_execution_engine(canonicalized_config) try: - self._run_strategy(canonicalized_config) + if self._action == 'create': + base_model_ir, self.applied_mutators = preprocess_model( + self.base_model, self.evaluator, self.applied_mutators, + full_ir=not isinstance(canonicalized_config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=canonicalized_config.execution_engine.dummy_input + if isinstance(canonicalized_config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None + ) + self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy) + elif self._action == 'resume': + base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint() + else: + raise RuntimeError(f'The experiment mode "{self._action}" is not supposed to invoke run() method.') + + self._run_strategy(base_model_ir, self.applied_mutators) # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: @@ -335,7 +369,36 @@ def 
export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for elif formatter == 'dict': return [get_mutation_dict(model) for model in all_models[:top_k]] - def resume(self, experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): + @staticmethod + def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): + """ + View a stopped experiment. + + Parameters + ---------- + experiment_id + The stopped experiment id. + port + The port of web UI. + non_blocking + If false, run in the foreground. If true, run in the background. + """ + experiment = RetiariiExperiment._view(experiment_id) + # view is nothing specific about RetiariiExperiment, directly using the method in base experiment class + super(RetiariiExperiment, experiment).start(port=port, debug=False, run_mode=RunMode.Detach) + if non_blocking: + return experiment + else: + try: + while True: + time.sleep(10) + except KeyboardInterrupt: + _logger.warning('KeyboardInterrupt detected') + finally: + experiment.stop() + + @staticmethod + def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): """ Resume a stopped experiment. @@ -350,7 +413,24 @@ def resume(self, experiment_id: str, port: int = 8080, wait_completion: bool = T debug Whether to start in debug mode. """ - self.id = experiment_id - self._action = 'resume' - config = launcher.get_stopped_experiment_config(experiment_id, None) - self.run(config, port=port, debug=debug) \ No newline at end of file + experiment = RetiariiExperiment._resume(experiment_id) + experiment.run(experiment.config, port=port, debug=debug) + # always return experiment for user's follow-up operations on the experiment + # wait_completion is not necessary as nas experiment is always in foreground + return experiment + + @staticmethod + def _resume(exp_id, exp_dir=None): + exp = RetiariiExperiment(None) + exp.id = exp_id + exp._action = 'resume' + exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + return exp + + @staticmethod + def _view(exp_id, exp_dir=None): + exp = RetiariiExperiment(None) + exp.id = exp_id + exp._action = 'view' + exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + return exp \ No newline at end of file diff --git a/nni/retiarii/graph.py b/nni/retiarii/graph.py index 0a99f81f18..fc988f45eb 100644 --- a/nni/retiarii/graph.py +++ b/nni/retiarii/graph.py @@ -13,6 +13,7 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast, overload) +from nni.common import dump, load if TYPE_CHECKING: from .mutator import Mutator @@ -172,14 +173,21 @@ def fork(self) -> 'Model': def _load(ir: Any) -> 'Model': model = Model(_internal=True) for graph_name, graph_data in ir.items(): - if graph_name != '_evaluator': + if graph_name not in ['_evaluator', 'model_id', 'python_class', 'python_init_params']: Graph._load(model, graph_name, graph_data)._register() + model.model_id = ir['model_id'] + model.python_class = ir['python_class'] + model.python_init_params = ir['python_init_params'] if '_evaluator' in ir: model.evaluator = Evaluator._load(ir['_evaluator']) return model def _dump(self) -> Any: ret = {name: graph._dump() for name, graph in self.graphs.items()} + # NOTE: only dump some necessary member variable, will be refactored + ret['model_id'] = self.model_id + ret['python_class'] = self.python_class + ret['python_init_params'] = self.python_init_params if self.evaluator is not None: ret['_evaluator'] = 
self.evaluator._dump() return ret diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index af10717654..20fe346ebc 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -11,6 +11,7 @@ from nni.experiment import Experiment, RunMode from nni.experiment.config import ExperimentConfig, convert, utils +from nni.retiarii.experiment.pytorch import RetiariiExperiment from nni.tools.annotation import expand_annotations, generate_search_space # used for v1-only legacy setup, remove them later @@ -104,9 +105,13 @@ def resume_experiment(args): legacy_launcher.resume_experiment(args) exit() - exp = Experiment._resume(exp_id, exp_dir) - run_mode = RunMode.Foreground if foreground else RunMode.Detach - exp.start(port, debug, run_mode) + exp_class = utils.get_experiment_class_using_config(config_json) + if exp_class is RetiariiExperiment: + RetiariiExperiment.resume(exp_id, port, True, debug) + else: + exp = Experiment._resume(exp_id, exp_dir) + run_mode = RunMode.Foreground if foreground else RunMode.Detach + exp.start(port, debug, run_mode) def view_experiment(args): exp_id = args.id @@ -118,5 +123,9 @@ def view_experiment(args): legacy_launcher.view_experiment(args) exit() - exp = Experiment._view(exp_id, exp_dir) - exp.start(port, run_mode=RunMode.Detach) + exp_class = utils.get_experiment_class_using_config(config_json) + if exp_class is RetiariiExperiment: + RetiariiExperiment.view(exp_id, port, non_blocking=True) + else: + exp = Experiment._view(exp_id, exp_dir) + exp.start(port, run_mode=RunMode.Detach) From 811ade74cc01fbe1db1ee221fbc0220206215e67 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 11 Jul 2022 10:54:07 +0800 Subject: [PATCH 50/77] fix pylint --- nni/retiarii/experiment/pytorch.py | 11 +++++------ nni/retiarii/graph.py | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 93ed0c649c..dbf5eb62b3 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -9,7 +9,7 @@ import time import warnings from threading import Thread -from typing import Any, List, Union, cast +from typing import Any, List, Union, cast, Tuple import colorama @@ -38,7 +38,6 @@ from ..serializer import is_model_wrapped from ..strategy import BaseStrategy from ..strategy.utils import dry_run_for_formatted_search_space -from nni.retiarii import strategy _logger = logging.getLogger(__name__) @@ -222,9 +221,9 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: set_execution_engine(engine) def _save_experiment_checkpoint(self, - base_model_ir, - applied_mutators, - strategy) -> None: + base_model_ir: Model, + applied_mutators: List[Mutator], + strategy: BaseStrategy) -> None: ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) @@ -233,7 +232,7 @@ def _save_experiment_checkpoint(self, with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: dump(strategy, fp) - def _load_experiment_checkpoint(self): + def _load_experiment_checkpoint(self) -> Tuple[Model, List[Mutator], BaseStrategy]: ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) diff --git a/nni/retiarii/graph.py 
b/nni/retiarii/graph.py index fc988f45eb..df542c99f5 100644 --- a/nni/retiarii/graph.py +++ b/nni/retiarii/graph.py @@ -13,7 +13,6 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast, overload) -from nni.common import dump, load if TYPE_CHECKING: from .mutator import Mutator From 7b2d042e648473314ba52f81dc07966107dcb661 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 11 Jul 2022 15:39:26 +0800 Subject: [PATCH 51/77] fix pyright --- nni/experiment/config/utils/internal.py | 6 +++--- nni/retiarii/experiment/pytorch.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 4e6e11934c..a17befe947 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -34,7 +34,9 @@ from .public import is_missing if typing.TYPE_CHECKING: + from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig from ..base import ConfigBase + from ..experiment_config import ExperimentConfig from ..training_service import TrainingServiceConfig ## handle relative path ## @@ -198,9 +200,7 @@ def get_ipv4_address() -> str: s.close() return addr -def init_experiment_config(config_json) -> ConfigBase: - from ..experiment_config import ExperimentConfig - from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig +def init_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return ExperimentConfig(**config_json) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index dbf5eb62b3..f9586cd1d5 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -224,7 +224,8 @@ def _save_experiment_checkpoint(self, base_model_ir: Model, applied_mutators: List[Mutator], strategy: BaseStrategy) -> None: - ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), + self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) with open(os.path.join(ckp_path, 'applied_mutators'), 'w') as fp: @@ -233,7 +234,8 @@ def _save_experiment_checkpoint(self, dump(strategy, fp) def _load_experiment_checkpoint(self) -> Tuple[Model, List[Mutator], BaseStrategy]: - ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), + self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) base_model_ir = Model._load(base_model_ir) From 36103d821d0cf33be11a3790c2c37f8d4d0205a5 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 6 Aug 2022 16:09:38 +0800 Subject: [PATCH 52/77] update --- nni/nas/experiment/pytorch.py | 9 +++++---- nni/tools/nnictl/launcher.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index 77a86365c9..c5f62afb41 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -18,7 +18,6 @@ import torch.nn as nn from nni.common import dump, load from nni.experiment import Experiment, 
RunMode, launcher -from nni.experiment.config.training_services import RemoteConfig from nni.nas.execution import list_models, set_execution_engine from nni.nas.execution.common import RetiariiAdvisor, get_mutation_dict, init_execution_engine, Model @@ -173,10 +172,10 @@ class RetiariiExperiment(Experiment): """ def __init__(self, base_model: nn.Module, - evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), + evaluator: Evaluator = cast(Evaluator, None), applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), - trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): + trainer: Any = None): super().__init__(None) self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) @@ -190,7 +189,7 @@ def __init__(self, base_model: nn.Module, raise ValueError('Evaluator should not be none.') self.base_model = base_model - self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator + self.evaluator: Evaluator = evaluator self.applied_mutators = applied_mutators self.strategy = strategy @@ -257,6 +256,7 @@ def run(self, Run the experiment. This function will block until experiment finish or error. """ + from nni.retiarii.oneshot.interface import BaseOneShotTrainer if isinstance(self.evaluator, BaseOneShotTrainer): # TODO: will throw a deprecation warning soon # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' @@ -349,6 +349,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for config = self.config.canonical_copy() assert not isinstance(config.execution_engine, PyEngineConfig), \ 'You should use `dict` formatter when using Python execution engine.' + from nni.retiarii.oneshot.interface import BaseOneShotTrainer if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' return self.evaluator.export() diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 20fe346ebc..b718434cd9 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -11,7 +11,7 @@ from nni.experiment import Experiment, RunMode from nni.experiment.config import ExperimentConfig, convert, utils -from nni.retiarii.experiment.pytorch import RetiariiExperiment +from nni.nas.experiment.pytorch import RetiariiExperiment from nni.tools.annotation import expand_annotations, generate_search_space # used for v1-only legacy setup, remove them later From 12334d6e53cfc0a8be9e25fb84a25b2d7c7f0720 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 6 Aug 2022 17:08:59 +0800 Subject: [PATCH 53/77] resolve comments --- nni/experiment/config/experiment_config.py | 3 ++- nni/experiment/config/utils/internal.py | 11 +++++++---- nni/experiment/launcher.py | 4 ++-- nni/nas/execution/common/utils.py | 10 +++++----- nni/nas/experiment/config/experiment_config.py | 3 ++- nni/nas/experiment/pytorch.py | 2 +- 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index e5124832dd..1d301c1515 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -12,6 +12,7 @@ import logging from pathlib import Path from typing import Any, List, Optional, Union +from typing_extensions import Literal import yaml @@ -61,7 +62,7 @@ class ExperimentConfig(ConfigBase): # In latter case hybrid training services can have different settings. 
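
Alongside the import cleanups above, the constructor's `trainer` parameter survives only as a deprecated alias for `evaluator` (its annotation is relaxed to `Any`, and the old one-shot trainer import is deferred into the methods that still need it). Below is a small, self-contained sketch of that keep-the-old-name-alive pattern; the class name and the exact wording are invented for illustration, only the redirect-with-`DeprecationWarning` idea comes from the patch.

import warnings

class ExperimentSketch:
    """Toy class showing how a renamed argument can be kept alive for a release."""

    def __init__(self, evaluator=None, trainer=None):
        if trainer is not None:
            warnings.warn('`trainer` is deprecated, please use `evaluator` instead.',
                          DeprecationWarning)
            evaluator = trainer  # forward the legacy argument to its new name
        if evaluator is None:
            raise ValueError('Evaluator should not be none.')
        self.evaluator = evaluator

# Old call sites keep working but emit a warning; new ones pass `evaluator`.
obj = ExperimentSketch(trainer=object())
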
experiment_name: Optional[str] = None - experiment_type: str = 'hpo' + experiment_type: Literal['hpo'] = 'hpo' search_space_file: Optional[utils.PathLike] = None search_space: Any = None trial_command: Optional[str] = None # training service field diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 4cea82e01e..6edaac4cfd 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,8 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'get_ipv4_address', 'init_experiment_config', 'get_experiment_class_using_config' + 'load_experiment_config', 'get_experiment_class_using_config', + 'get_ipv4_address' ] import copy @@ -34,7 +35,7 @@ from .public import is_missing if typing.TYPE_CHECKING: - from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig + from nni.nas.experiment.config import RetiariiExeConfig from ..base import ConfigBase from ..experiment_config import ExperimentConfig from ..training_service import TrainingServiceConfig @@ -201,7 +202,9 @@ def get_ipv4_address() -> str: s.close() return addr -def init_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: +def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: + from nni.nas.experiment.config import RetiariiExeConfig + from ..experiment_config import ExperimentConfig if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return ExperimentConfig(**config_json) @@ -218,7 +221,7 @@ def init_experiment_config(config_json) -> typing.Union[ExperimentConfig, Retiar def get_experiment_class_using_config(config_json): from ...experiment import Experiment - from nni.retiarii.experiment.pytorch import RetiariiExperiment + from nni.nas.experiment.pytorch import RetiariiExperiment if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return Experiment diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index 9c50bd92e3..64b99feb46 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -18,7 +18,7 @@ from typing_extensions import Literal from .config import ExperimentConfig -from .config.utils import init_experiment_config +from .config.utils import load_experiment_config from . import rest from ..tools.nnictl.config_utils import Experiments, Config from ..tools.nnictl.nnictl_utils import update_experiment @@ -204,7 +204,7 @@ def _save_experiment_information(experiment_id: str, port: int, start_time: int, def get_stopped_experiment_config(exp_id, exp_dir=None): config_json = get_stopped_experiment_config_json(exp_id, exp_dir) # type: ignore - config = init_experiment_config(config_json) # type: ignore + config = load_experiment_config(config_json) # type: ignore if exp_dir and not os.path.samefile(exp_dir, config.experiment_working_directory): msg = 'Experiment working directory provided in command line (%s) is different from experiment config (%s)' _logger.warning(msg, exp_dir, config.experiment_working_directory) diff --git a/nni/nas/execution/common/utils.py b/nni/nas/execution/common/utils.py index 553abe580f..e92f3e61ac 100644 --- a/nni/nas/execution/common/utils.py +++ b/nni/nas/execution/common/utils.py @@ -1,16 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
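
The helpers above decide between an HPO `Experiment` and a `RetiariiExperiment` from the stored config: an explicit `experimentType` field wins, and older configs that predate the field are treated as NAS exactly when they carry an `executionEngine` entry. A standalone sketch of that decision over a plain dict, returning strings instead of the real classes so it needs no NNI imports:

def guess_experiment_kind(config_json: dict) -> str:
    # Newer configs carry an explicit discriminator.
    if 'experimentType' in config_json:
        kind = config_json['experimentType']
        if kind in ('hpo', 'nas'):
            return kind
        raise ValueError(f'Unknown experiment_type: {kind}')
    # Older configs have no discriminator; only NAS configs store an engine.
    return 'nas' if 'executionEngine' in config_json else 'hpo'

assert guess_experiment_kind({'experimentType': 'nas'}) == 'nas'
assert guess_experiment_kind({'executionEngine': {'name': 'base'}}) == 'nas'
assert guess_experiment_kind({'trialCommand': 'python main.py'}) == 'hpo'
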
-__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary'] +__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary', 'init_execution_engine'] from typing import Any, List, cast from nni.experiment.config.training_services import RemoteConfig from .engine import AbstractExecutionEngine from .graph import Model -from ...experiment.config import ( - BaseEngineConfig, PyEngineConfig, - CgoEngineConfig, BenchmarkEngineConfig -) def unpack_if_only_one(ele: List[Any]): @@ -40,6 +36,10 @@ def get_mutation_summary(model: Model) -> dict: return mutation_dict_to_summary(mutation) def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: + from ...experiment.config import ( + BaseEngineConfig, PyEngineConfig, + CgoEngineConfig, BenchmarkEngineConfig + ) if isinstance(config.execution_engine, BaseEngineConfig): from ..pytorch.graph import BaseExecutionEngine return BaseExecutionEngine(port, url_prefix) diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index af1d4c824c..61f24ba759 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -5,6 +5,7 @@ import sys from dataclasses import dataclass, MISSING from typing import Any, Dict, Union, Optional +from typing_extensions import Literal from nni.experiment.config import utils, ExperimentConfig @@ -36,7 +37,7 @@ def _get_ee_config_class(engine_name): @dataclass(init=False) class RetiariiExeConfig(ExperimentConfig): # FIXME: refactor this class to inherit from a new common base class with HPO config - experiment_type: str = 'nas' + experiment_type: Literal['nas'] = 'nas' search_space: Any = '' trial_code_directory: utils.PathLike = '.' 
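
The `experiment_type` fields above are narrowed from plain `str` to `Literal['hpo']` / `Literal['nas']`, so the discriminator is visible to type checkers and ends up in the canonical JSON. A compact sketch of the idea with throwaway dataclasses; it uses `typing.Literal` and therefore assumes Python 3.8+, whereas the patch imports the `typing_extensions` backport:

from dataclasses import dataclass
from typing import Literal

@dataclass
class HpoConfigSketch:
    experiment_type: Literal['hpo'] = 'hpo'
    trial_command: str = ''

@dataclass
class NasConfigSketch:
    experiment_type: Literal['nas'] = 'nas'
    trial_command: str = '_reserved'

configs = [HpoConfigSketch(trial_command='python main.py'), NasConfigSketch()]
# The discriminator is an ordinary field, so it survives dict/JSON round trips.
assert [c.experiment_type for c in configs] == ['hpo', 'nas']
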
trial_command: str = '_reserved' diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index c5f62afb41..b85e3265fc 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -10,7 +10,7 @@ import time import warnings from threading import Thread -from typing import Any, List, cast, Union, Tuple +from typing import Any, List, cast, Tuple import colorama From 49b1dc4668ece4ff0981629bc68500f14c4552b3 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 6 Aug 2022 17:33:24 +0800 Subject: [PATCH 54/77] minor --- nni/experiment/config/utils/internal.py | 9 +++++---- nni/nas/execution/common/integration.py | 2 +- nni/tools/nnictl/launcher.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 6edaac4cfd..e7b6c80a47 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,7 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'load_experiment_config', 'get_experiment_class_using_config', + 'load_experiment_config', 'get_experiment_cls_using_config', 'get_ipv4_address' ] @@ -203,6 +203,7 @@ def get_ipv4_address() -> str: return addr def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: + # avoid circular import from nni.nas.experiment.config import RetiariiExeConfig from ..experiment_config import ExperimentConfig if 'experimentType' in config_json: @@ -219,9 +220,9 @@ def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, Retiar else: return ExperimentConfig(**config_json) -def get_experiment_class_using_config(config_json): - from ...experiment import Experiment +def get_experiment_cls_using_config(config_json): from nni.nas.experiment.pytorch import RetiariiExperiment + from ...experiment import Experiment if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return Experiment @@ -233,4 +234,4 @@ def get_experiment_class_using_config(config_json): if 'executionEngine' in config_json: return RetiariiExperiment else: - return Experiment \ No newline at end of file + return Experiment diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 4c56c390dd..ce04b79f5a 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -236,4 +236,4 @@ def _process_value(value) -> Any: # hopefully a float def handle_import_data(self, data): # FIXME: ignore imported data for now, as strategy has not supported resume - pass \ No newline at end of file + pass diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index b718434cd9..8e7fc33f62 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -105,7 +105,7 @@ def resume_experiment(args): legacy_launcher.resume_experiment(args) exit() - exp_class = utils.get_experiment_class_using_config(config_json) + exp_class = utils.get_experiment_cls_using_config(config_json) if exp_class is RetiariiExperiment: RetiariiExperiment.resume(exp_id, port, True, debug) else: @@ -123,7 +123,7 @@ def view_experiment(args): legacy_launcher.view_experiment(args) exit() - exp_class = utils.get_experiment_class_using_config(config_json) + exp_class = utils.get_experiment_cls_using_config(config_json) if exp_class is RetiariiExperiment: 
RetiariiExperiment.view(exp_id, port, non_blocking=True) else: From adf87a855e405c7e8bf67e9b2975673d553f67a5 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 13:57:50 +0800 Subject: [PATCH 55/77] add ut --- nni/nas/execution/common/utils.py | 2 +- .../experiment/config/experiment_config.py | 2 +- nni/nas/experiment/pytorch.py | 24 +++++------ test/ut/nas/test_experiment.py | 42 +++++++++++++++++++ 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/nni/nas/execution/common/utils.py b/nni/nas/execution/common/utils.py index e92f3e61ac..96693f3564 100644 --- a/nni/nas/execution/common/utils.py +++ b/nni/nas/execution/common/utils.py @@ -65,4 +65,4 @@ def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: '"benchmark" must be set when benchmark execution engine is used.' return BenchmarkExecutionEngine(config.execution_engine.benchmark) else: - raise ValueError(f'Unsupported engine type: {config.execution_engine}') \ No newline at end of file + raise ValueError(f'Unsupported engine type: {config.execution_engine}') diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index 61f24ba759..83f3d6c1c4 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -53,7 +53,7 @@ def __init__(self, training_service_platform: Union[str, None] = None, **kwargs): super().__init__(training_service_platform, **kwargs) - if self.execution_engine != MISSING: + if not utils.is_missing(self.execution_engine): # this branch means kwargs is not {} and self.execution_engine has been assigned in super(), # reassign it because super() may instantiate ExecutionEngineConfig by mistake self.execution_engine = init_execution_engine_config(kwargs['executionEngine']) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index b85e3265fc..d6c4b64cd7 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -171,7 +171,7 @@ class RetiariiExperiment(Experiment): ... final_model = Net() """ - def __init__(self, base_model: nn.Module, + def __init__(self, base_model: nn.Module = cast(nn.Module, None), evaluator: Evaluator = cast(Evaluator, None), applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), @@ -185,8 +185,15 @@ def __init__(self, base_model: nn.Module, evaluator = trainer # base_model is None means the experiment is in resume or view mode - if base_model is not None and evaluator is None: - raise ValueError('Evaluator should not be none.') + if base_model is not None: + if evaluator is None: + raise ValueError('Evaluator should not be none.') + # check for sanity + if not is_model_wrapped(base_model): + warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + + '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' + 'but it may cause inconsistent behavior compared to the time when you add it.' + colorama.Style.RESET_ALL, + RuntimeWarning) self.base_model = base_model self.evaluator: Evaluator = evaluator @@ -196,13 +203,6 @@ def __init__(self, base_model: nn.Module, self._dispatcher = None self._dispatcher_thread = None - # check for sanity - if not is_model_wrapped(base_model): - warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + - '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' - 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, - RuntimeWarning) - def _run_strategy(self, base_model_ir: Model, applied_mutators: List[Mutator]) -> None: _logger.info('Start strategy...') search_space = dry_run_for_formatted_search_space(base_model_ir, applied_mutators) @@ -422,7 +422,7 @@ def _resume(exp_id, exp_dir=None): exp = RetiariiExperiment(None) exp.id = exp_id exp._action = 'resume' - exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) return exp @staticmethod @@ -430,5 +430,5 @@ def _view(exp_id, exp_dir=None): exp = RetiariiExperiment(None) exp.id = exp_id exp._action = 'view' - exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) return exp \ No newline at end of file diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index 991e3b23f6..28ea8edc00 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -1,4 +1,6 @@ +from asyncio import subprocess import os +import subprocess import sys import nni @@ -112,6 +114,46 @@ def test_multitrial_experiment(pytestconfig): assert isinstance(exp.export_top_models()[0], dict) exp.stop() +def test_multitrial_experiment_resume_view(pytestconfig): + # start a normal nas experiment + base_model = Net() + evaluator = get_mnist_evaluator() + search_strategy = strategy.Random() + exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) + exp_id = exp.id + exp_config = RetiariiExeConfig('local') + exp_config.trial_concurrency = 1 + exp_config.max_trial_number = 1 + exp_config._trial_command_params = nas_experiment_trial_params(pytestconfig.rootpath) + exp.run(exp_config) + ensure_success(exp) + assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + + # resume the above nas experiment. 
only tested the resume logic in the python side, + # as no more trial is executed after resume, the above experiment is already finished + print('python api resume...') + exp = RetiariiExperiment.resume(exp_id) + ensure_success(exp) + # TODO: currently `export_top_models` does not work as strategy's states are not resumed + # assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + # view the above experiment in non blocking mode then stop it + print('python api view...') + exp = RetiariiExperiment.view(exp_id, non_blocking=True) + exp.stop() + + # the following is nnictl resume and view + print('nnictl resume...') + new_env = os.environ.copy() + new_env['PYTHONPATH'] = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)) + proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) + assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode + print('nnictl view...') + proc = subprocess.run(f'nnictl view {exp_id}', shell=True) + assert proc.returncode == 0, 'view nas experiment failed with code %d' % proc.returncode + proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) + assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode def test_oneshot_experiment(): base_model = Net() From 6ecfd3ac5d4acbf6d73f8b2ddd837e9b5076fa38 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 13:59:35 +0800 Subject: [PATCH 56/77] minor --- nni/nas/experiment/config/experiment_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index 83f3d6c1c4..7a39357b1b 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -3,7 +3,7 @@ import os import sys -from dataclasses import dataclass, MISSING +from dataclasses import dataclass from typing import Any, Dict, Union, Optional from typing_extensions import Literal From 8f20b2f4920a5a2a243cec42c1655f8fb602afec Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 14:13:27 +0800 Subject: [PATCH 57/77] fix pylint --- nni/nas/experiment/config/experiment_config.py | 2 +- nni/nas/experiment/pytorch.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index 7a39357b1b..de55158c45 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -99,4 +99,4 @@ def _canonicalize(self, _parents): self.trial_command = trial_command_tmpl.format(**_trial_command_params).strip() - super()._canonicalize([self]) \ No newline at end of file + super()._canonicalize([self]) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index d6c4b64cd7..d7baa71891 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -419,7 +419,7 @@ def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, d @staticmethod def _resume(exp_id, exp_dir=None): - exp = RetiariiExperiment(None) + exp = RetiariiExperiment(cast(nn.Module, None)) exp.id = exp_id exp._action = 'resume' exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) @@ -427,8 +427,8 @@ def _resume(exp_id, exp_dir=None): @staticmethod def _view(exp_id, exp_dir=None): - exp = RetiariiExperiment(None) + exp = RetiariiExperiment(cast(nn.Module, None)) exp.id = 
exp_id exp._action = 'view' exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) - return exp \ No newline at end of file + return exp From ece0771ef6d1be6c5e6fd58c7458299734ee099a Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 15:02:22 +0800 Subject: [PATCH 58/77] fix ut --- nni/nas/execution/common/graph.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nni/nas/execution/common/graph.py b/nni/nas/execution/common/graph.py index c413a07959..b1d7f64acc 100644 --- a/nni/nas/execution/common/graph.py +++ b/nni/nas/execution/common/graph.py @@ -126,9 +126,10 @@ def _load(ir: Any) -> 'Model': for graph_name, graph_data in ir.items(): if graph_name not in ['_evaluator', 'model_id', 'python_class', 'python_init_params']: Graph._load(model, graph_name, graph_data)._register() - model.model_id = ir['model_id'] - model.python_class = ir['python_class'] - model.python_init_params = ir['python_init_params'] + if 'model_id' in ir: # backward compatibility + model.model_id = ir['model_id'] + model.python_class = ir['python_class'] + model.python_init_params = ir['python_init_params'] if '_evaluator' in ir: model.evaluator = Evaluator._load(ir['_evaluator']) return model From bb7180428ebd4328c08ffd83c197eaffb74f11b6 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 15:41:17 +0800 Subject: [PATCH 59/77] fix ut --- nni/nas/experiment/pytorch.py | 4 +--- test/ut/experiment/test_exp_config.py | 2 ++ test/ut/experiment/test_ts_remote.py | 2 ++ test/ut/nas/test_graph.py | 4 ++++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index d7baa71891..cec8b62945 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -396,7 +396,7 @@ def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): experiment.stop() @staticmethod - def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): + def resume(experiment_id: str, port: int = 8080, debug: bool = False): """ Resume a stopped experiment. @@ -406,8 +406,6 @@ def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, d The stopped experiment id. port The port of web UI. - wait_completion - If true, run in the foreground. If false, run in the background. debug Whether to start in debug mode. 
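
The graph.py hunk above makes `Model._load` tolerate IR dumps produced before `model_id`, `python_class`, and `python_init_params` existed, by keying the extra assignments on `'model_id' in ir`. A minimal sketch of the same guard, with the `Graph`/`Evaluator` machinery replaced by plain dicts; only the key names are taken from the patch:

def load_model_ir(ir: dict) -> dict:
    """Rebuild a model record from its dumped IR, tolerating older dumps."""
    model = {'graphs': {}}
    for name, data in ir.items():
        if name not in ('_evaluator', 'model_id', 'python_class', 'python_init_params'):
            model['graphs'][name] = data
    if 'model_id' in ir:  # fields present only in newer dumps
        model['model_id'] = ir['model_id']
        model['python_class'] = ir['python_class']
        model['python_init_params'] = ir['python_init_params']
    return model

old_dump = {'_model': {'nodes': []}}                      # pre-upgrade dump
new_dump = {'_model': {'nodes': []}, 'model_id': 7,
            'python_class': 'Net', 'python_init_params': {}}
assert 'model_id' not in load_model_ir(old_dump)
assert load_model_ir(new_dump)['model_id'] == 7
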
""" diff --git a/test/ut/experiment/test_exp_config.py b/test/ut/experiment/test_exp_config.py index 972f8076e5..5048a4322c 100644 --- a/test/ut/experiment/test_exp_config.py +++ b/test/ut/experiment/test_exp_config.py @@ -28,6 +28,7 @@ def expand_path(path): minimal_class.tuner.name = 'random' minimal_canon = { + 'experimentType': 'hpo', 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', 'trialCodeDirectory': os.path.realpath('.'), @@ -54,6 +55,7 @@ def expand_path(path): detailed_canon = { 'experimentName': 'test case', + 'experimentType': 'hpo', 'searchSpaceFile': expand_path('assets/search_space.json'), 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', diff --git a/test/ut/experiment/test_ts_remote.py b/test/ut/experiment/test_ts_remote.py index 770e6faac9..e7c16972e0 100644 --- a/test/ut/experiment/test_ts_remote.py +++ b/test/ut/experiment/test_ts_remote.py @@ -43,6 +43,7 @@ ) minimal_canon = { + 'experimentType': 'hpo', 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', 'trialCodeDirectory': os.path.realpath('.'), @@ -106,6 +107,7 @@ } detailed_canon = { + 'experimentType': 'hpo', 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', 'trialCodeDirectory': os.path.realpath('.'), diff --git a/test/ut/nas/test_graph.py b/test/ut/nas/test_graph.py index 69dd8c52a9..7a8a39900b 100644 --- a/test/ut/nas/test_graph.py +++ b/test/ut/nas/test_graph.py @@ -37,6 +37,10 @@ def _test_file(json_path): # skip comparison of _evaluator orig_ir.pop('_evaluator') dump_ir.pop('_evaluator') + # skip three experiment fields + dump_ir.pop('model_id') + dump_ir.pop('python_class') + dump_ir.pop('python_init_params') assert orig_ir == dump_ir From 31856093541fe1dd11be811d84edb17e82bebb4c Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 15:49:46 +0800 Subject: [PATCH 60/77] minor --- nni/tools/nnictl/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 8e7fc33f62..c9f4860672 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -107,7 +107,7 @@ def resume_experiment(args): exp_class = utils.get_experiment_cls_using_config(config_json) if exp_class is RetiariiExperiment: - RetiariiExperiment.resume(exp_id, port, True, debug) + RetiariiExperiment.resume(exp_id, port, debug) else: exp = Experiment._resume(exp_id, exp_dir) run_mode = RunMode.Foreground if foreground else RunMode.Detach From 0b4cc963867f91e79d0300d3b0805b91b43b02d6 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 17:01:09 +0800 Subject: [PATCH 61/77] resolve comments --- nni/experiment/config/utils/internal.py | 22 ++++++-------- nni/nas/execution/api.py | 38 +++++++++++++++++++++++-- nni/nas/execution/common/utils.py | 38 ++----------------------- nni/nas/experiment/pytorch.py | 3 +- 4 files changed, 49 insertions(+), 52 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index e7b6c80a47..5cbe216a74 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -205,20 +205,16 @@ def get_ipv4_address() -> str: def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: # avoid circular import from nni.nas.experiment.config import RetiariiExeConfig + from nni.nas.experiment.pytorch import RetiariiExperiment from ..experiment_config import ExperimentConfig - if 'experimentType' in config_json: - if config_json['experimentType'] == 'hpo': - return 
ExperimentConfig(**config_json) - elif config_json['experimentType'] == 'nas': - return RetiariiExeConfig(**config_json) - else: - raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + from ...experiment import Experiment + exp_cls = get_experiment_cls_using_config(config_json) + if exp_cls is Experiment: + return ExperimentConfig(**config_json) + elif exp_cls is RetiariiExperiment: + return RetiariiExeConfig(**config_json) else: - # for backward compatibility, experiment config <= v2.8 does not have "experiment_type" - if 'executionEngine' in config_json: - return RetiariiExeConfig(**config_json) - else: - return ExperimentConfig(**config_json) + raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') def get_experiment_cls_using_config(config_json): from nni.nas.experiment.pytorch import RetiariiExperiment @@ -229,7 +225,7 @@ def get_experiment_cls_using_config(config_json): elif config_json['experimentType'] == 'nas': return RetiariiExperiment else: - raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + raise ValueError(f'Unknown experiment_type: {config_json["experimentType"]}') else: if 'executionEngine' in config_json: return RetiariiExperiment diff --git a/nni/nas/execution/api.py b/nni/nas/execution/api.py index ef2558e758..4581bbe42f 100644 --- a/nni/nas/execution/api.py +++ b/nni/nas/execution/api.py @@ -3,8 +3,9 @@ import time import warnings -from typing import Iterable +from typing import Iterable, cast +from nni.experiment.config.training_services import RemoteConfig from nni.nas.execution.common import ( Model, ModelStatus, AbstractExecutionEngine, @@ -14,11 +15,44 @@ _execution_engine = None _default_listener = None -__all__ = ['get_execution_engine', 'get_and_register_default_listener', +__all__ = ['init_execution_engine', 'get_execution_engine', 'get_and_register_default_listener', 'list_models', 'submit_models', 'wait_models', 'query_available_resources', 'set_execution_engine', 'is_stopped_exec', 'budget_exhausted'] +def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: + from ..experiment.config import ( + BaseEngineConfig, PyEngineConfig, + CgoEngineConfig, BenchmarkEngineConfig + ) + if isinstance(config.execution_engine, BaseEngineConfig): + from .pytorch.graph import BaseExecutionEngine + return BaseExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, CgoEngineConfig): + from .pytorch.cgo.engine import CGOExecutionEngine + + assert not isinstance(config.training_service, list) \ + and config.training_service.platform == 'remote', \ + "CGO execution engine currently only supports remote training service" + assert config.execution_engine.batch_waiting_time is not None \ + and config.execution_engine.max_concurrency_cgo is not None + return CGOExecutionEngine(cast(RemoteConfig, config.training_service), + max_concurrency=config.execution_engine.max_concurrency_cgo, + batch_waiting_time=config.execution_engine.batch_waiting_time, + rest_port=port, + rest_url_prefix=url_prefix) + elif isinstance(config.execution_engine, PyEngineConfig): + from .pytorch.simplified import PurePythonExecutionEngine + return PurePythonExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, BenchmarkEngineConfig): + from .pytorch.benchmark import BenchmarkExecutionEngine + assert config.execution_engine.benchmark is not None, \ + '"benchmark" must be set when benchmark execution engine is used.' 
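
Throughout these changes the NAS-specific imports are pulled out of module scope: `load_experiment_config` imports lazily behind an "avoid circular import" note, type-only names move under `typing.TYPE_CHECKING`, and the engine factory above imports each engine class inside its own branch. A generic sketch of those two deferral techniques, using a standard-library module as a stand-in for the heavy or cycle-prone dependency:

from __future__ import annotations
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers, so it can never create an import cycle
    # and costs nothing at runtime.
    from decimal import Decimal

def to_decimal(value: str) -> Decimal:
    # Runtime import deferred into the function body: the dependency is paid
    # for only when this code path actually runs.
    from decimal import Decimal
    return Decimal(value)

print(to_decimal('1.5'))
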
+ return BenchmarkExecutionEngine(config.execution_engine.benchmark) + else: + raise ValueError(f'Unsupported engine type: {config.execution_engine}') + + def set_execution_engine(engine: AbstractExecutionEngine) -> None: global _execution_engine if _execution_engine is not None: diff --git a/nni/nas/execution/common/utils.py b/nni/nas/execution/common/utils.py index 96693f3564..5230ddf706 100644 --- a/nni/nas/execution/common/utils.py +++ b/nni/nas/execution/common/utils.py @@ -1,11 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary', 'init_execution_engine'] +__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary'] -from typing import Any, List, cast -from nni.experiment.config.training_services import RemoteConfig -from .engine import AbstractExecutionEngine +from typing import Any, List from .graph import Model @@ -34,35 +32,3 @@ def mutation_dict_to_summary(mutation: dict) -> dict: def get_mutation_summary(model: Model) -> dict: mutation = get_mutation_dict(model) return mutation_dict_to_summary(mutation) - -def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: - from ...experiment.config import ( - BaseEngineConfig, PyEngineConfig, - CgoEngineConfig, BenchmarkEngineConfig - ) - if isinstance(config.execution_engine, BaseEngineConfig): - from ..pytorch.graph import BaseExecutionEngine - return BaseExecutionEngine(port, url_prefix) - elif isinstance(config.execution_engine, CgoEngineConfig): - from ..pytorch.cgo.engine import CGOExecutionEngine - - assert not isinstance(config.training_service, list) \ - and config.training_service.platform == 'remote', \ - "CGO execution engine currently only supports remote training service" - assert config.execution_engine.batch_waiting_time is not None \ - and config.execution_engine.max_concurrency_cgo is not None - return CGOExecutionEngine(cast(RemoteConfig, config.training_service), - max_concurrency=config.execution_engine.max_concurrency_cgo, - batch_waiting_time=config.execution_engine.batch_waiting_time, - rest_port=port, - rest_url_prefix=url_prefix) - elif isinstance(config.execution_engine, PyEngineConfig): - from ..pytorch.simplified import PurePythonExecutionEngine - return PurePythonExecutionEngine(port, url_prefix) - elif isinstance(config.execution_engine, BenchmarkEngineConfig): - from ..pytorch.benchmark import BenchmarkExecutionEngine - assert config.execution_engine.benchmark is not None, \ - '"benchmark" must be set when benchmark execution engine is used.' 
- return BenchmarkExecutionEngine(config.execution_engine.benchmark) - else: - raise ValueError(f'Unsupported engine type: {config.execution_engine}') diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index cec8b62945..7b5bde39ac 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -20,7 +20,8 @@ from nni.experiment import Experiment, RunMode, launcher from nni.nas.execution import list_models, set_execution_engine -from nni.nas.execution.common import RetiariiAdvisor, get_mutation_dict, init_execution_engine, Model +from nni.nas.execution.api import init_execution_engine +from nni.nas.execution.common import RetiariiAdvisor, get_mutation_dict, Model from nni.nas.execution.pytorch.codegen import model_to_pytorch_script from nni.nas.execution.pytorch.converter import convert_to_graph from nni.nas.execution.pytorch.converter.graph_gen import GraphConverterWithShape From e520c0d890b598dcc7b0e6d2e1856e49080da9b5 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 18:07:23 +0800 Subject: [PATCH 62/77] resolve comments --- nni/experiment/config/utils/internal.py | 6 ++++-- nni/nas/experiment/pytorch.py | 26 ++++++++++++++----------- test/ut/nas/test_experiment.py | 1 - 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 5cbe216a74..21c4f4b8b0 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -35,7 +35,9 @@ from .public import is_missing if typing.TYPE_CHECKING: + from nni.nas.experiment.pytorch import RetiariiExperiment from nni.nas.experiment.config import RetiariiExeConfig + from ...experiment import Experiment from ..base import ConfigBase from ..experiment_config import ExperimentConfig from ..training_service import TrainingServiceConfig @@ -202,7 +204,7 @@ def get_ipv4_address() -> str: s.close() return addr -def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: +def load_experiment_config(config_json: dict) -> ExperimentConfig | RetiariiExeConfig: # avoid circular import from nni.nas.experiment.config import RetiariiExeConfig from nni.nas.experiment.pytorch import RetiariiExperiment @@ -216,7 +218,7 @@ def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, Retiar else: raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') -def get_experiment_cls_using_config(config_json): +def get_experiment_cls_using_config(config_json: dict) -> Experiment | RetiariiExperiment: from nni.nas.experiment.pytorch import RetiariiExperiment from ...experiment import Experiment if 'experimentType' in config_json: diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index 7b5bde39ac..c31ee1e51a 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -10,7 +10,7 @@ import time import warnings from threading import Thread -from typing import Any, List, cast, Tuple +from typing import Any, List, cast, Tuple, TYPE_CHECKING import colorama @@ -38,6 +38,9 @@ PyEngineConfig, CgoEngineConfig, BenchmarkEngineConfig ) +if TYPE_CHECKING: + from pathlib import Path + _logger = logging.getLogger(__name__) @@ -219,9 +222,9 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: def _save_experiment_checkpoint(self, base_model_ir: Model, applied_mutators: List[Mutator], - strategy: BaseStrategy) -> None: - ckp_path = 
os.path.join(os.path.expanduser(self.config.experiment_working_directory), - self.id, 'checkpoint') + strategy: BaseStrategy, + exp_work_dir: Path) -> None: + ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) with open(os.path.join(ckp_path, 'applied_mutators'), 'w') as fp: @@ -229,9 +232,8 @@ def _save_experiment_checkpoint(self, with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: dump(strategy, fp) - def _load_experiment_checkpoint(self) -> Tuple[Model, List[Mutator], BaseStrategy]: - ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), - self.id, 'checkpoint') + def _load_experiment_checkpoint(self, exp_work_dir: Path) -> Tuple[Model, List[Mutator], BaseStrategy]: + ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) base_model_ir = Model._load(base_model_ir) @@ -297,9 +299,11 @@ def run(self, dummy_input=canonicalized_config.execution_engine.dummy_input if isinstance(canonicalized_config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) - self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy) + self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy, + canonicalized_config.experiment_working_directory) elif self._action == 'resume': - base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint() + base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint( + canonicalized_config.experiment_working_directory) else: raise RuntimeError(f'The experiment mode "{self._action}" is not supposed to invoke run() method.') @@ -369,7 +373,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for return [get_mutation_dict(model) for model in all_models[:top_k]] @staticmethod - def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): + def view(experiment_id: str, port: int = 8080, non_blocking: bool = False) -> RetiariiExperiment | None: """ View a stopped experiment. @@ -397,7 +401,7 @@ def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): experiment.stop() @staticmethod - def resume(experiment_id: str, port: int = 8080, debug: bool = False): + def resume(experiment_id: str, port: int = 8080, debug: bool = False) -> RetiariiExperiment: """ Resume a stopped experiment. 
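
`_save_experiment_checkpoint` / `_load_experiment_checkpoint` above persist the dumped model IR, the applied mutators, and the strategy under `<experiment working dir>/<experiment id>/checkpoint`, which is what `resume()` later reads back. A simplified round-trip sketch that keeps that file layout but serializes plain dicts with the standard-library `json` module instead of `nni.common.dump`/`load`, and creates the directory itself (the real code relies on NNI having created it):

import json
import os
import tempfile

def save_checkpoint(work_dir, exp_id, model_ir, mutators, strategy):
    ckp_path = os.path.join(work_dir, exp_id, 'checkpoint')
    os.makedirs(ckp_path, exist_ok=True)
    for name, obj in (('nas_model', model_ir),
                      ('applied_mutators', mutators),
                      ('strategy', strategy)):
        with open(os.path.join(ckp_path, name), 'w') as fp:
            json.dump(obj, fp)
    return ckp_path

def load_checkpoint(work_dir, exp_id):
    ckp_path = os.path.join(work_dir, exp_id, 'checkpoint')
    loaded = []
    for name in ('nas_model', 'applied_mutators', 'strategy'):
        with open(os.path.join(ckp_path, name)) as fp:
            loaded.append(json.load(fp))
    return tuple(loaded)

with tempfile.TemporaryDirectory() as work_dir:
    save_checkpoint(work_dir, 'exp123', {'graphs': {}}, [{'label': 'm1'}], {'name': 'Random'})
    model_ir, mutators, strategy = load_checkpoint(work_dir, 'exp123')
    assert strategy == {'name': 'Random'}
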
diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index 28ea8edc00..f60693acd3 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -1,4 +1,3 @@ -from asyncio import subprocess import os import subprocess import sys From a6cb74d763a570af50c7a969ebbfbca215bf1816 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 19:16:58 +0800 Subject: [PATCH 63/77] fix pylint --- nni/experiment/config/utils/internal.py | 2 +- nni/nas/experiment/pytorch.py | 33 +++++++++++-------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 21c4f4b8b0..8e1e30b160 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -218,7 +218,7 @@ def load_experiment_config(config_json: dict) -> ExperimentConfig | RetiariiExeC else: raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') -def get_experiment_cls_using_config(config_json: dict) -> Experiment | RetiariiExperiment: +def get_experiment_cls_using_config(config_json: dict) -> type[Experiment] | type[RetiariiExperiment]: from nni.nas.experiment.pytorch import RetiariiExperiment from ...experiment import Experiment if 'experimentType' in config_json: diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index c31ee1e51a..c85102c405 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -39,7 +39,7 @@ ) if TYPE_CHECKING: - from pathlib import Path + from nni.experiment.config.utils import PathLike _logger = logging.getLogger(__name__) @@ -195,9 +195,9 @@ def __init__(self, base_model: nn.Module = cast(nn.Module, None), # check for sanity if not is_model_wrapped(base_model): warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + - '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' - 'but it may cause inconsistent behavior compared to the time when you add it.' + colorama.Style.RESET_ALL, - RuntimeWarning) + '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' + 'but it may cause inconsistent behavior compared to the time when you add it.' 
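
For reference, the call pattern the new entry points are designed for, mirroring `test_multitrial_experiment_resume_view` above. This is a usage sketch only: it assumes an NNI installation that already contains these changes and a previously stopped NAS experiment whose id you substitute for the placeholder, so it is not runnable as-is.

from nni.nas.experiment.pytorch import RetiariiExperiment

exp_id = 'REPLACE_WITH_A_STOPPED_EXPERIMENT_ID'  # placeholder, not a real id

# resume() reloads the checkpoint, reruns the strategy in the foreground,
# and returns the experiment object for follow-up calls.
exp = RetiariiExperiment.resume(exp_id, port=8080)
exp.stop()

# view() only restarts the web UI; non_blocking=True hands control back immediately.
exp = RetiariiExperiment.view(exp_id, port=8081, non_blocking=True)
exp.stop()
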
+ colorama.Style.RESET_ALL, + RuntimeWarning) self.base_model = base_model self.evaluator: Evaluator = evaluator @@ -219,11 +219,8 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: engine = init_execution_engine(config, self.port, self.url_prefix) set_execution_engine(engine) - def _save_experiment_checkpoint(self, - base_model_ir: Model, - applied_mutators: List[Mutator], - strategy: BaseStrategy, - exp_work_dir: Path) -> None: + def _save_experiment_checkpoint(self, base_model_ir: Model, applied_mutators: List[Mutator], + strategy: BaseStrategy, exp_work_dir: PathLike) -> None: ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) @@ -232,7 +229,7 @@ def _save_experiment_checkpoint(self, with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: dump(strategy, fp) - def _load_experiment_checkpoint(self, exp_work_dir: Path) -> Tuple[Model, List[Mutator], BaseStrategy]: + def _load_experiment_checkpoint(self, exp_work_dir: PathLike) -> Tuple[Model, List[Mutator], BaseStrategy]: ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) @@ -284,26 +281,26 @@ def run(self, self.strategy.run(base_model_ir, self.applied_mutators) else: ws_url = f'ws://localhost:{port}/tuner' - canonicalized_config = self._start_impl(port, debug, RunMode.Background, ws_url, ['retiarii']) - canonicalized_config = cast(RetiariiExeConfig, canonicalized_config) + canoni_conf = self._start_impl(port, debug, RunMode.Background, ws_url, ['retiarii']) + canoni_conf = cast(RetiariiExeConfig, canoni_conf) self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run, daemon=True) self._dispatcher_thread.start() # FIXME: engine cannot be created twice - self._create_execution_engine(canonicalized_config) + self._create_execution_engine(canoni_conf) try: if self._action == 'create': base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, - full_ir=not isinstance(canonicalized_config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=canonicalized_config.execution_engine.dummy_input - if isinstance(canonicalized_config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None + full_ir=not isinstance(canoni_conf.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=canoni_conf.execution_engine.dummy_input + if isinstance(canoni_conf.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy, - canonicalized_config.experiment_working_directory) + canoni_conf.experiment_working_directory) elif self._action == 'resume': base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint( - canonicalized_config.experiment_working_directory) + canoni_conf.experiment_working_directory) else: raise RuntimeError(f'The experiment mode "{self._action}" is not supposed to invoke run() method.') From e6f0fea4ffee35b325ea2b96a52d207050d42d49 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 10 Aug 2022 11:17:29 +0800 Subject: [PATCH 64/77] resolve comments --- nni/experiment/config/utils/internal.py | 42 ++++++++++++------------- nni/nas/experiment/pytorch.py | 9 +++--- nni/tools/nnictl/launcher.py | 23 
+++++++------- test/ut/nas/test_experiment.py | 4 ++- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 8e1e30b160..88c7ee3c0d 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -26,7 +26,7 @@ import os.path from pathlib import Path import socket -import typing +from typing import Tuple, TYPE_CHECKING, get_type_hints import typeguard @@ -34,7 +34,7 @@ from .public import is_missing -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from nni.nas.experiment.pytorch import RetiariiExperiment from nni.nas.experiment.config import RetiariiExeConfig from ...experiment import Experiment @@ -83,7 +83,7 @@ def fields(config: ConfigBase) -> list[dataclasses.Field]: # Similar to `dataclasses.fields()`, but use `typing.get_types_hints()` to get `field.type`. # This is useful when postponed evaluation is enabled. ret = [copy.copy(field) for field in dataclasses.fields(config)] - types = typing.get_type_hints(type(config)) + types = get_type_hints(type(config)) for field in ret: field.type = types[field.name] return ret @@ -205,31 +205,29 @@ def get_ipv4_address() -> str: return addr def load_experiment_config(config_json: dict) -> ExperimentConfig | RetiariiExeConfig: - # avoid circular import - from nni.nas.experiment.config import RetiariiExeConfig - from nni.nas.experiment.pytorch import RetiariiExperiment - from ..experiment_config import ExperimentConfig - from ...experiment import Experiment - exp_cls = get_experiment_cls_using_config(config_json) - if exp_cls is Experiment: - return ExperimentConfig(**config_json) - elif exp_cls is RetiariiExperiment: - return RetiariiExeConfig(**config_json) - else: - raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') + _, exp_conf_cls = get_experiment_cls_using_config(config_json) + return exp_conf_cls(**config_json) -def get_experiment_cls_using_config(config_json: dict) -> type[Experiment] | type[RetiariiExperiment]: - from nni.nas.experiment.pytorch import RetiariiExperiment - from ...experiment import Experiment +def get_experiment_cls_using_config(config_json: dict) -> Tuple[type[Experiment] | type[RetiariiExperiment], + type[ExperimentConfig] | type[RetiariiExeConfig]]: + # avoid circular import and unnecessary dependency on pytorch if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': - return Experiment + from ...experiment import Experiment + from ..experiment_config import ExperimentConfig + return Experiment, ExperimentConfig elif config_json['experimentType'] == 'nas': - return RetiariiExperiment + from nni.nas.experiment.pytorch import RetiariiExperiment + from nni.nas.experiment.config import RetiariiExeConfig + return RetiariiExperiment, RetiariiExeConfig else: raise ValueError(f'Unknown experiment_type: {config_json["experimentType"]}') else: if 'executionEngine' in config_json: - return RetiariiExperiment + from nni.nas.experiment.pytorch import RetiariiExperiment + from nni.nas.experiment.config import RetiariiExeConfig + return RetiariiExperiment, RetiariiExeConfig else: - return Experiment + from ...experiment import Experiment + from ..experiment_config import ExperimentConfig + return Experiment, ExperimentConfig diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index c85102c405..60c5939176 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -258,11 +258,10 @@ def run(self, """ from 
nni.retiarii.oneshot.interface import BaseOneShotTrainer if isinstance(self.evaluator, BaseOneShotTrainer): - # TODO: will throw a deprecation warning soon - # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' - # 'We will try to convert this trainer to our new implementation to run the algorithm. ' - # 'In case you want to stick to the old implementation, ' - # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) + warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' + 'We will try to convert this trainer to our new implementation to run the algorithm. ' + 'In case you want to stick to the old implementation, ' + 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) self.evaluator.fit() return diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index c9f4860672..72219669fa 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -11,7 +11,6 @@ from nni.experiment import Experiment, RunMode from nni.experiment.config import ExperimentConfig, convert, utils -from nni.nas.experiment.pytorch import RetiariiExperiment from nni.tools.annotation import expand_annotations, generate_search_space # used for v1-only legacy setup, remove them later @@ -105,13 +104,14 @@ def resume_experiment(args): legacy_launcher.resume_experiment(args) exit() - exp_class = utils.get_experiment_cls_using_config(config_json) - if exp_class is RetiariiExperiment: - RetiariiExperiment.resume(exp_id, port, debug) - else: - exp = Experiment._resume(exp_id, exp_dir) + exp_cls, _ = utils.get_experiment_cls_using_config(config_json) + if exp_cls is Experiment: + exp = exp_cls._resume(exp_id, exp_dir) run_mode = RunMode.Foreground if foreground else RunMode.Detach exp.start(port, debug, run_mode) + else: + # exp_cls is RetiariiExperiment + exp_cls.resume(exp_id, port, debug) def view_experiment(args): exp_id = args.id @@ -123,9 +123,10 @@ def view_experiment(args): legacy_launcher.view_experiment(args) exit() - exp_class = utils.get_experiment_cls_using_config(config_json) - if exp_class is RetiariiExperiment: - RetiariiExperiment.view(exp_id, port, non_blocking=True) - else: - exp = Experiment._view(exp_id, exp_dir) + exp_cls, _ = utils.get_experiment_cls_using_config(config_json) + if exp_cls is Experiment: + exp = exp_cls._view(exp_id, exp_dir) exp.start(port, run_mode=RunMode.Detach) + else: + # exp_cls is RetiariiExperiment + exp_cls.view(exp_id, port, non_blocking=True) diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index f60693acd3..8640d58c17 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -113,6 +113,7 @@ def test_multitrial_experiment(pytestconfig): assert isinstance(exp.export_top_models()[0], dict) exp.stop() + def test_multitrial_experiment_resume_view(pytestconfig): # start a normal nas experiment base_model = Net() @@ -145,7 +146,7 @@ def test_multitrial_experiment_resume_view(pytestconfig): # the following is nnictl resume and view print('nnictl resume...') new_env = os.environ.copy() - new_env['PYTHONPATH'] = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)) + new_env['PYTHONPATH'] = pytestconfig.rootpath proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode print('nnictl view...') @@ -154,6 +155,7 @@ def 
test_multitrial_experiment_resume_view(pytestconfig): proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode + def test_oneshot_experiment(): base_model = Net() evaluator = get_mnist_evaluator() From f4faa739c006cc5c9115dd01e0151e9f3da470f6 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 10 Aug 2022 12:48:15 +0800 Subject: [PATCH 65/77] move test --- test/algo/nas/test_multitrial.py | 43 +++++++++++++++++++++++++++++++- test/ut/nas/test_experiment.py | 43 -------------------------------- 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/test/algo/nas/test_multitrial.py b/test/algo/nas/test_multitrial.py index cb7d395aea..14ccd4bf39 100644 --- a/test/algo/nas/test_multitrial.py +++ b/test/algo/nas/test_multitrial.py @@ -1,6 +1,6 @@ import multiprocessing import os -import sys +import subprocess import time import pytest @@ -76,3 +76,44 @@ def test_exp_exit_without_stop(pytestconfig): return process.kill() raise RuntimeError(f'Experiment fails to stop in {timeout} seconds.') + + +def test_multitrial_experiment_resume_view(pytestconfig): + # start a normal nas experiment + base_model, evaluator = _mnist_net('simple', {'max_epochs': 1}) + search_strategy = strategy.Random() + exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) + exp_id = exp.id + exp_config = RetiariiExeConfig('local') + exp_config.trial_concurrency = 1 + exp_config.max_trial_number = 1 + exp_config._trial_command_params = nas_experiment_trial_params(pytestconfig.rootpath) + exp.run(exp_config) + ensure_success(exp) + assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + + # resume the above nas experiment. only tested the resume logic in the python side, + # as no more trial is executed after resume, the above experiment is already finished + print('python api resume...') + exp = RetiariiExperiment.resume(exp_id) + ensure_success(exp) + # TODO: currently `export_top_models` does not work as strategy's states are not resumed + # assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + # view the above experiment in non blocking mode then stop it + print('python api view...') + exp = RetiariiExperiment.view(exp_id, non_blocking=True) + exp.stop() + + # the following is nnictl resume and view + print('nnictl resume...') + new_env = os.environ.copy() + new_env['PYTHONPATH'] = str(pytestconfig.rootpath) + proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) + assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode + print('nnictl view...') + proc = subprocess.run(f'nnictl view {exp_id}', shell=True) + assert proc.returncode == 0, 'view nas experiment failed with code %d' % proc.returncode + proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) + assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode \ No newline at end of file diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index 8640d58c17..991e3b23f6 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -1,5 +1,4 @@ import os -import subprocess import sys import nni @@ -114,48 +113,6 @@ def test_multitrial_experiment(pytestconfig): exp.stop() -def test_multitrial_experiment_resume_view(pytestconfig): - # start a normal nas experiment - base_model = Net() - evaluator = get_mnist_evaluator() - search_strategy = strategy.Random() - exp = RetiariiExperiment(base_model, 
evaluator, strategy=search_strategy) - exp_id = exp.id - exp_config = RetiariiExeConfig('local') - exp_config.trial_concurrency = 1 - exp_config.max_trial_number = 1 - exp_config._trial_command_params = nas_experiment_trial_params(pytestconfig.rootpath) - exp.run(exp_config) - ensure_success(exp) - assert isinstance(exp.export_top_models()[0], dict) - exp.stop() - - # resume the above nas experiment. only tested the resume logic in the python side, - # as no more trial is executed after resume, the above experiment is already finished - print('python api resume...') - exp = RetiariiExperiment.resume(exp_id) - ensure_success(exp) - # TODO: currently `export_top_models` does not work as strategy's states are not resumed - # assert isinstance(exp.export_top_models()[0], dict) - exp.stop() - # view the above experiment in non blocking mode then stop it - print('python api view...') - exp = RetiariiExperiment.view(exp_id, non_blocking=True) - exp.stop() - - # the following is nnictl resume and view - print('nnictl resume...') - new_env = os.environ.copy() - new_env['PYTHONPATH'] = pytestconfig.rootpath - proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) - assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode - print('nnictl view...') - proc = subprocess.run(f'nnictl view {exp_id}', shell=True) - assert proc.returncode == 0, 'view nas experiment failed with code %d' % proc.returncode - proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) - assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode - - def test_oneshot_experiment(): base_model = Net() evaluator = get_mnist_evaluator() From eab49a62a66ffba8fee0b95afd80812102c39152 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 26 Aug 2022 20:00:50 +0800 Subject: [PATCH 66/77] fix bug and add doc --- README.md | 3 ++- docs/source/notes/research_publications.rst | 26 +++++++++++++++++++++ nni/nas/execution/common/integration.py | 3 +++ test/algo/nas/test_multitrial.py | 7 ++++++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c9eae6fce..e3bf4eb92f 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,8 @@ NNI automates feature engineering, neural architecture search, hyperparameter tu * **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on June-22-2022_ * **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on June-22-2022_ -* **New webinar**: [Introducing Retiarii: A deep learning exploratory-training framework on NNI](https://note.microsoft.com/MSR-Webinar-Retiarii-Registration-Live.html) - _scheduled on June-24-2021_ +* **New research paper**: [SparTA: Deep-Learning Model Sparsity via Tensor-with-Sparsity-Attribute](https://www.usenix.org/system/files/osdi22-zheng-ningxin.pdf) - _published in OSDI 2022_ +* **New research paper**: [Privacy-preserving Online AutoML for Domain-Specific Face Detection](https://openaccess.thecvf.com/content/CVPR2022/papers/Yan_Privacy-Preserving_Online_AutoML_for_Domain-Specific_Face_Detection_CVPR_2022_paper.pdf) - _published in CVPR 2022_ * **Newly upgraded documentation**: [Doc upgraded](https://nni.readthedocs.io/en/stable) diff --git a/docs/source/notes/research_publications.rst b/docs/source/notes/research_publications.rst index 1ca12930e0..9ab0d6eaaa 100644 --- 
a/docs/source/notes/research_publications.rst +++ b/docs/source/notes/research_publications.rst @@ -7,6 +7,19 @@ System Research --------------- +* `SparTA: Deep-Learning Model Sparsity via Tensor-with-Sparsity-Attribute `__ + +.. code-block:: bibtex + + @inproceedings{zheng2022sparta, + title={$\{$SparTA$\}$:$\{$Deep-Learning$\}$ Model Sparsity via $\{$Tensor-with-Sparsity-Attribute$\}$}, + author={Zheng, Ningxin and Lin, Bin and Zhang, Quanlu and Ma, Lingxiao and Yang, Yuqing and Yang, Fan and Wang, Yang and Yang, Mao and Zhou, Lidong}, + booktitle={16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}, + pages={213--232}, + year={2022} + } + + * `Retiarii: A Deep Learning Exploratory-Training Framework `__ .. code-block:: bibtex @@ -52,6 +65,19 @@ New Algorithms ^^^^^^^^^^^^^^ +* `Privacy-preserving Online AutoML for Domain-Specific Face Detection `__ + +.. code-block:: bibtex + + @inproceedings{yan2022privacy, + title={Privacy-preserving Online AutoML for Domain-Specific Face Detection}, + author={Yan, Chenqian and Zhang, Yuge and Zhang, Quanlu and Yang, Yaming and Jiang, Xinyang and Yang, Yuqing and Wang, Baoyuan}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4134--4144}, + year={2022} + } + + * `TextNAS: A Neural Architecture Search Space Tailored for Text Representation `__ .. code-block:: bibtex diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index ce04b79f5a..3deb319ea8 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -237,3 +237,6 @@ def _process_value(value) -> Any: # hopefully a float def handle_import_data(self, data): # FIXME: ignore imported data for now, as strategy has not supported resume pass + + def handle_add_customized_trial(self, data): + pass diff --git a/test/algo/nas/test_multitrial.py b/test/algo/nas/test_multitrial.py index 14ccd4bf39..81de742d2d 100644 --- a/test/algo/nas/test_multitrial.py +++ b/test/algo/nas/test_multitrial.py @@ -98,18 +98,25 @@ def test_multitrial_experiment_resume_view(pytestconfig): print('python api resume...') exp = RetiariiExperiment.resume(exp_id) ensure_success(exp) + # sleep here because there would be several seconds for the experiment status to change + # to ERROR from INITIALIZED/RUNNING if the resume gets error. 
+ time.sleep(5) + assert exp.get_status() == 'DONE', f'The experiment status should not be {exp.get_status()}' # TODO: currently `export_top_models` does not work as strategy's states are not resumed # assert isinstance(exp.export_top_models()[0], dict) exp.stop() # view the above experiment in non blocking mode then stop it print('python api view...') exp = RetiariiExperiment.view(exp_id, non_blocking=True) + assert exp.get_status() == 'VIEWED', f'The experiment status should not be {exp.get_status()}' exp.stop() # the following is nnictl resume and view print('nnictl resume...') new_env = os.environ.copy() new_env['PYTHONPATH'] = str(pytestconfig.rootpath) + # NOTE: experiment status (e.g., ERROR) is not checked, because it runs in blocking mode and + # the rest server exits right after the command is done proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode print('nnictl view...') From bccb126beb025ffd53d2d90ea884bb33cb924678 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 29 Aug 2022 13:48:21 +0800 Subject: [PATCH 67/77] resolve comment --- test/algo/nas/test_multitrial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/algo/nas/test_multitrial.py b/test/algo/nas/test_multitrial.py index 81de742d2d..0257d6ba02 100644 --- a/test/algo/nas/test_multitrial.py +++ b/test/algo/nas/test_multitrial.py @@ -100,7 +100,7 @@ def test_multitrial_experiment_resume_view(pytestconfig): ensure_success(exp) # sleep here because there would be several seconds for the experiment status to change # to ERROR from INITIALIZED/RUNNING if the resume gets error. - time.sleep(5) + time.sleep(6) assert exp.get_status() == 'DONE', f'The experiment status should not be {exp.get_status()}' # TODO: currently `export_top_models` does not work as strategy's states are not resumed # assert isinstance(exp.export_top_models()[0], dict) From 4d1778318773629dc68ee18cc3bc1a48f4d98b67 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 14:42:02 +0800 Subject: [PATCH 68/77] fix issue --- nni/nas/execution/common/integration.py | 31 ++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 3deb319ea8..42f41c451d 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -60,7 +60,8 @@ def __init__(self, url: str): self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None self.parameters_count = 0 - + # for dealing with the resumed running trials of the before-resumed experiment + self.previous_max_param_id = 0 # Sometimes messages arrive first before the callbacks get registered. # Or in case that we allow engine to be absent during the experiment. # Here we need to store the messages and invoke them later. 
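The hunk that follows is the core of the resume fix: events belonging to trials created before the interruption must not be re-dispatched to the strategy. Reduced to a self-contained sketch (class and method names here are illustrative; patches 69/77 and 70/77 later move the real bookkeeping into nni.recoverable.Recoverable as recover_parameter_id / is_created_in_previous_exp), the pattern is: remember the highest parameter id replayed from the old run, then drop trial-end and metric events at or below that watermark.

class ResumeWatermark:
    # illustrative stand-in for the previous_max_param_id bookkeeping added below
    def __init__(self):
        self.previous_max_param_id = 0   # highest parameter id replayed from the interrupted run
        self.dispatched = []             # events that actually reach the strategy

    def on_recovered_trials(self, trials):
        # trials replayed by nnimanager on resume, e.g.
        # [{'parameter_id': 3, 'parameter_source': 'resumed', 'parameters': {...}}]
        for trial in trials:
            pid = trial.get('parameter_id')
            if isinstance(pid, int) and pid > self.previous_max_param_id:
                self.previous_max_param_id = pid

    def on_trial_end(self, parameter_id, succeeded):
        if parameter_id <= self.previous_max_param_id:
            return                       # belongs to the pre-resume run: ignore, do not re-dispatch
        self.dispatched.append((parameter_id, succeeded))

w = ResumeWatermark()
w.on_recovered_trials([{'parameter_id': 3, 'parameters': {}}])
w.on_trial_end(2, True)   # recovered trial, ignored
w.on_trial_end(4, True)   # new trial, handled
assert w.dispatched == [(4, True)]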
@@ -212,10 +213,22 @@ def handle_update_search_space(self, data): self.search_space = data def handle_trial_end(self, data): + # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring + id_ = nni.load(data['hyper_params'])['parameter_id'] + if id_ <= self.previous_max_param_id: + _logger.info('The end of the recovered trial %d is ignored', id_) + return _logger.debug('Trial end: %s', data) - self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED') + self.invoke_callback('trial_end', id_, data['event'] == 'SUCCEEDED') def handle_report_metric_data(self, data): + # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring + if data['parameter_id'] <= self.previous_max_param_id: + _logger.info('The metrics of the recovered trial %d are ignored', data['parameter_id']) + return + # NOTE: this part is not aligned with hpo tuners. + # in hpo tuners, trial_job_id is used for intermediate results handling + # parameter_id is for final result handling. _logger.debug('Metric reported: %s', data) if data['type'] == MetricType.REQUEST_PARAMETER: raise ValueError('Request parameter not supported') @@ -239,4 +252,16 @@ def handle_import_data(self, data): pass def handle_add_customized_trial(self, data): - pass + # this is for handling the resuming of the interrupted data: parameters + if not isinstance(data, list): + data = [data] + + for trial in data: + # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} + if isinstance(trial, str): + trial = nni.load(trial) + if self.previous_max_param_id < trial['parameter_id']: + self.previous_max_param_id = trial['parameter_id'] + self.parameters_count = self.previous_max_param_id + + # TODO: handle customized trials From 097a7817213c5f42b918054aa0de9bd379f37f35 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 21:24:37 +0800 Subject: [PATCH 69/77] fix --- .../hpo/bohb_advisor/bohb_advisor.py | 15 +++++++++- nni/algorithms/hpo/hyperband_advisor.py | 11 ++++++- nni/algorithms/hpo/tpe_tuner.py | 13 -------- nni/nas/execution/common/integration.py | 28 +++++++---------- nni/nas/execution/common/integration_api.py | 10 ++++++- nni/recoverable.py | 30 +++++++++++++++++++ nni/runtime/msg_dispatcher.py | 21 ++++++++----- nni/runtime/msg_dispatcher_base.py | 1 + nni/tuner.py | 8 ----- 9 files changed, 88 insertions(+), 49 deletions(-) diff --git a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py index 9127ddaddc..56adad67f2 100644 --- a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py +++ b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py @@ -648,6 +648,9 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ + if self.is_created_in_previous_exp(data['parameter_id']): + # The end of the recovered trial is ignored + return logger.debug('Tuner handle trial end, result is %s', data) hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) @@ -695,6 +698,13 @@ def handle_report_metric_data(self, data): ValueError Data type not supported """ + if self.is_created_in_previous_exp(data['parameter_id']): + if data['type'] == MetricType.FINAL: + # only deal with final metric using import data + param = self.get_previous_param(data['parameter_id']) + trial_data = [{'parameter': param, 'value': nni.load(data['value'])}] + 
self.handle_import_data(trial_data) + return logger.debug('handle report metric data = %s', data) if 'value' in data: data['value'] = nni.load(data['value']) @@ -752,7 +762,10 @@ def handle_report_metric_data(self, data): 'Data type not supported: {}'.format(data['type'])) def handle_add_customized_trial(self, data): - pass + global _next_parameter_id + # data: parameters + previous_max_param_id = self.recover_parameter_id(data) + _next_parameter_id = previous_max_param_id + 1 def handle_import_data(self, data): """Import additional data for tuning diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index 899696137d..f60273fc27 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -521,6 +521,9 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ + if self.is_created_in_previous_exp(data['parameter_id']): + # The end of the recovered trial is ignored + return hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) if data['trial_job_id'] in self.job_id_para_id_map: @@ -538,6 +541,9 @@ def handle_report_metric_data(self, data): ValueError Data type not supported """ + if self.is_created_in_previous_exp(data['parameter_id']): + # do not support recovering the algorithm state + return if 'value' in data: data['value'] = nni.load(data['value']) # multiphase? need to check @@ -576,7 +582,10 @@ def handle_report_metric_data(self, data): raise ValueError('Data type not supported: {}'.format(data['type'])) def handle_add_customized_trial(self, data): - pass + global _next_parameter_id + # data: parameters + previous_max_param_id = self.recover_parameter_id(data) + _next_parameter_id = previous_max_param_id + 1 def handle_import_data(self, data): pass diff --git a/nni/algorithms/hpo/tpe_tuner.py b/nni/algorithms/hpo/tpe_tuner.py index 463c18f7d9..b9df04c3a4 100644 --- a/nni/algorithms/hpo/tpe_tuner.py +++ b/nni/algorithms/hpo/tpe_tuner.py @@ -218,19 +218,6 @@ def import_data(self, data): # for resuming experiment self.dedup.add_history(param) _logger.info(f'Replayed {len(data)} FINISHED trials') - def import_customized_data(self, data): # for dedup customized / resumed - if isinstance(data, str): - data = nni.load(data) - - for trial in data: - # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} - if isinstance(trial, str): - trial = nni.load(trial) - param = format_parameters(trial['parameters'], self.space) - self._running_params[trial['parameter_id']] = param - self.dedup.add_history(param) - _logger.info(f'Replayed {len(data)} RUNING/WAITING trials') - def suggest(args, rng, space, history): params = {} for key, spec in space.items(): diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 42f41c451d..7f34021ad9 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -60,12 +60,16 @@ def __init__(self, url: str): self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None self.parameters_count = 0 - # for dealing with the resumed running trials of the before-resumed experiment - self.previous_max_param_id = 0 # Sometimes messages arrive first before the callbacks get registered. # Or in case that we allow engine to be absent during the experiment. # Here we need to store the messages and invoke them later. 
self.call_queue: List[Tuple[str, list]] = [] + # this is for waiting the to-be-recovered trials from nnimanager + self._advisor_initialized = False + + @property + def initialized(self): + return self._advisor_initialized def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]): """ @@ -215,7 +219,7 @@ def handle_update_search_space(self, data): def handle_trial_end(self, data): # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring id_ = nni.load(data['hyper_params'])['parameter_id'] - if id_ <= self.previous_max_param_id: + if self.is_created_in_previous_exp(id_): _logger.info('The end of the recovered trial %d is ignored', id_) return _logger.debug('Trial end: %s', data) @@ -223,7 +227,7 @@ def handle_trial_end(self, data): def handle_report_metric_data(self, data): # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring - if data['parameter_id'] <= self.previous_max_param_id: + if self.is_created_in_previous_exp(data['parameter_id']): _logger.info('The metrics of the recovered trial %d are ignored', data['parameter_id']) return # NOTE: this part is not aligned with hpo tuners. @@ -252,16 +256,6 @@ def handle_import_data(self, data): pass def handle_add_customized_trial(self, data): - # this is for handling the resuming of the interrupted data: parameters - if not isinstance(data, list): - data = [data] - - for trial in data: - # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} - if isinstance(trial, str): - trial = nni.load(trial) - if self.previous_max_param_id < trial['parameter_id']: - self.previous_max_param_id = trial['parameter_id'] - self.parameters_count = self.previous_max_param_id - - # TODO: handle customized trials + previous_max_param_id = self.recover_parameter_id(data) + self.parameters_count = previous_max_param_id + self._advisor_initialized = True diff --git a/nni/nas/execution/common/integration_api.py b/nni/nas/execution/common/integration_api.py index f7f08adb31..37c381bf6b 100644 --- a/nni/nas/execution/common/integration_api.py +++ b/nni/nas/execution/common/integration_api.py @@ -6,12 +6,16 @@ '_advisor' # FIXME: hack to make it importable for tests ] +import logging +import time import warnings from typing import NewType, Any import nni from nni.common.version import version_check +_logger = logging.getLogger(__name__) + # NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor # because it would induce cycled import RetiariiAdvisor = NewType('RetiariiAdvisor', Any) @@ -41,7 +45,11 @@ def send_trial(parameters: dict, placement_constraint=None) -> int: Send a new trial. Executed on tuner end. Return a ID that is the unique identifier for this trial. 
""" - return get_advisor().send_trial(parameters, placement_constraint) + advisor = get_advisor() + while not advisor.initialized: + _logger.info('Wait for RetiariiAdvisor to be initialized...') + time.sleep(0.5) + return advisor.send_trial(parameters, placement_constraint) def receive_trial_parameters() -> dict: """ diff --git a/nni/recoverable.py b/nni/recoverable.py index 4ff419b8f9..2992e0e172 100644 --- a/nni/recoverable.py +++ b/nni/recoverable.py @@ -4,8 +4,12 @@ from __future__ import annotations import os +import nni class Recoverable: + def __init__(self): + self.recovered_max_param_id = -1 + self.recovered_trial_params = {} def load_checkpoint(self) -> None: pass @@ -18,3 +22,29 @@ def get_checkpoint_path(self) -> str | None: if ckp_path is not None and os.path.isdir(ckp_path): return ckp_path return None + + def recover_parameter_id(self, data) -> int: + # this is for handling the resuming of the interrupted data: parameters + if not isinstance(data, list): + data = [data] + + previous_max_param_id = 0 + for trial in data: + # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} + if isinstance(trial, str): + trial = nni.load(trial) + if not isinstance(trial['parameter_id'], int): + # for dealing with user customized trials + # skip for now + continue + self.recovered_trial_params[trial['parameter_id']] = trial['parameters'] + if previous_max_param_id < trial['parameter_id']: + previous_max_param_id = trial['parameter_id'] + self.recovered_max_param_id = previous_max_param_id + return previous_max_param_id + + def is_created_in_previous_exp(self, param_id: int) -> bool: + return param_id <= self.recovered_max_param_id + + def get_previous_param(self, param_id: int) -> dict: + return self.recovered_trial_params[param_id] \ No newline at end of file diff --git a/nni/runtime/msg_dispatcher.py b/nni/runtime/msg_dispatcher.py index b337d879db..dd62c1e9a1 100644 --- a/nni/runtime/msg_dispatcher.py +++ b/nni/runtime/msg_dispatcher.py @@ -120,15 +120,10 @@ def handle_import_data(self, data): self.tuner.import_data(data) def handle_add_customized_trial(self, data): + global _next_parameter_id # data: parameters - if not isinstance(data, list): - data = [data] - - for _ in data: - id_ = _create_parameter_id() - _customized_parameter_ids.add(id_) - - self.tuner.import_customized_data(data) + previous_max_param_id = self.recover_parameter_id(data) + _next_parameter_id = previous_max_param_id + 1 def handle_report_metric_data(self, data): """ @@ -137,6 +132,13 @@ def handle_report_metric_data(self, data): - 'value': metric value reported by nni.report_final_result() - 'type': report type, support {'FINAL', 'PERIODICAL'} """ + if self.is_created_in_previous_exp(data['parameter_id']): + if data['type'] == MetricType.FINAL: + # only deal with final metric using import data + param = self.get_previous_param(data['parameter_id']) + trial_data = [{'parameter': param, 'value': load(data['value'])}] + self.handle_import_data(trial_data) + return # metrics value is dumped as json string in trial, so we need to decode it here if 'value' in data: data['value'] = load(data['value']) @@ -166,6 +168,9 @@ def handle_trial_end(self, data): - event: the job's state - hyper_params: the hyperparameters generated and returned by tuner """ + if self.is_created_in_previous_exp(data['parameter_id']): + # The end of the recovered trial is ignored + return trial_job_id = data['trial_job_id'] _ended_trials.add(trial_job_id) if trial_job_id in _trial_history: diff --git 
a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 99e6c71c91..ac6a3b37e3 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -30,6 +30,7 @@ class MsgDispatcherBase(Recoverable): """ def __init__(self, command_channel_url=None): + super().__init__() self.stopping = False if command_channel_url is None: command_channel_url = dispatcher_env_vars.NNI_TUNER_COMMAND_CHANNEL diff --git a/nni/tuner.py b/nni/tuner.py index c94e68043a..87b168db65 100644 --- a/nni/tuner.py +++ b/nni/tuner.py @@ -219,14 +219,6 @@ def import_data(self, data: list[TrialRecord]) -> None: # data: a list of dictionarys, each of which has at least two keys, 'parameter' and 'value' pass - def import_customized_data(self, data: list[TrialRecord]) -> None: - """ - Internal API under revising, not recommended for end users. - """ - # Import resume data for avoiding duplications - # data: a list of dictionarys, each of which has at least two keys, 'parameter_id' and 'parameters' - pass - def _on_exit(self) -> None: pass From ca4d86d994d730848213ea4eb95333eaa9e3b9bc Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 21:37:52 +0800 Subject: [PATCH 70/77] update --- nni/algorithms/hpo/bohb_advisor/bohb_advisor.py | 2 +- nni/algorithms/hpo/hyperband_advisor.py | 2 +- nni/runtime/msg_dispatcher.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py index 56adad67f2..4904247aa4 100644 --- a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py +++ b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py @@ -648,7 +648,7 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(data['parameter_id']): + if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): # The end of the recovered trial is ignored return logger.debug('Tuner handle trial end, result is %s', data) diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index f60273fc27..22766c7b2f 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -521,7 +521,7 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(data['parameter_id']): + if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): # The end of the recovered trial is ignored return hyper_params = nni.load(data['hyper_params']) diff --git a/nni/runtime/msg_dispatcher.py b/nni/runtime/msg_dispatcher.py index dd62c1e9a1..50c9188d11 100644 --- a/nni/runtime/msg_dispatcher.py +++ b/nni/runtime/msg_dispatcher.py @@ -168,7 +168,7 @@ def handle_trial_end(self, data): - event: the job's state - hyper_params: the hyperparameters generated and returned by tuner """ - if self.is_created_in_previous_exp(data['parameter_id']): + if self.is_created_in_previous_exp(load(data['hyper_params'])['parameter_id']): # The end of the recovered trial is ignored return trial_job_id = data['trial_job_id'] From eadfeb9d0da3fab7b0dc86d235c3278fff55b5ea Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 21:41:24 +0800 Subject: [PATCH 71/77] minor --- nni/algorithms/hpo/bohb_advisor/bohb_advisor.py | 4 ++-- nni/algorithms/hpo/hyperband_advisor.py | 4 ++-- nni/runtime/msg_dispatcher.py | 
5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py index 4904247aa4..847c9e1a01 100644 --- a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py +++ b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py @@ -648,11 +648,11 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): + hyper_params = nni.load(data['hyper_params']) + if self.is_created_in_previous_exp(hyper_params['parameter_id']): # The end of the recovered trial is ignored return logger.debug('Tuner handle trial end, result is %s', data) - hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) if data['trial_job_id'] in self.job_id_para_id_map: del self.job_id_para_id_map[data['trial_job_id']] diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index 22766c7b2f..d5c4db05be 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -521,10 +521,10 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): + hyper_params = nni.load(data['hyper_params']) + if self.is_created_in_previous_exp(hyper_params['parameter_id']): # The end of the recovered trial is ignored return - hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) if data['trial_job_id'] in self.job_id_para_id_map: del self.job_id_para_id_map[data['trial_job_id']] diff --git a/nni/runtime/msg_dispatcher.py b/nni/runtime/msg_dispatcher.py index 50c9188d11..42ba4c9de8 100644 --- a/nni/runtime/msg_dispatcher.py +++ b/nni/runtime/msg_dispatcher.py @@ -168,7 +168,8 @@ def handle_trial_end(self, data): - event: the job's state - hyper_params: the hyperparameters generated and returned by tuner """ - if self.is_created_in_previous_exp(load(data['hyper_params'])['parameter_id']): + id_ = load(data['hyper_params'])['parameter_id'] + if self.is_created_in_previous_exp(id_): # The end of the recovered trial is ignored return trial_job_id = data['trial_job_id'] @@ -178,7 +179,7 @@ def handle_trial_end(self, data): if self.assessor is not None: self.assessor.trial_end(trial_job_id, data['event'] == 'SUCCEEDED') if self.tuner is not None: - self.tuner.trial_end(load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED') + self.tuner.trial_end(id_, data['event'] == 'SUCCEEDED') def _handle_final_metric_data(self, data): """Call tuner to process final results From 2439a570476d70d8e4dfcb90deb17790d0cb6223 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 08:01:47 +0800 Subject: [PATCH 72/77] fix pylint --- nni/algorithms/hpo/hyperband_advisor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index d5c4db05be..cd53a24123 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -542,7 +542,7 @@ def handle_report_metric_data(self, data): Data type not supported """ if self.is_created_in_previous_exp(data['parameter_id']): - # do not support recovering the algorithm state + # do not support recovering the 
algorithm state return if 'value' in data: data['value'] = nni.load(data['value']) From c17256d4b8706e5d22526d1ed53f27a58815958d Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 08:38:48 +0800 Subject: [PATCH 73/77] fix bug --- nni/nas/execution/common/integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 7f34021ad9..2990237e09 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -209,6 +209,7 @@ def mark_experiment_as_ending(self): self.send(CommandType.NoMoreTrialJobs, '') def handle_request_trial_jobs(self, num_trials): + self._advisor_initialized = True _logger.debug('Request trial jobs: %s', num_trials) self.invoke_callback('request_trial_jobs', num_trials) @@ -258,4 +259,3 @@ def handle_import_data(self, data): def handle_add_customized_trial(self, data): previous_max_param_id = self.recover_parameter_id(data) self.parameters_count = previous_max_param_id - self._advisor_initialized = True From 7d905ec9dc0f05c09818e8b5d5ddc662cbda03b7 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 08:51:50 +0800 Subject: [PATCH 74/77] resolve comments --- nni/nas/execution/common/integration.py | 9 +++++---- nni/nas/execution/common/integration_api.py | 9 +-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 2990237e09..24b8f1f44f 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -4,6 +4,7 @@ __all__ = ['RetiariiAdvisor'] import logging +import time import os from typing import Any, Callable, Optional, Dict, List, Tuple @@ -67,10 +68,6 @@ def __init__(self, url: str): # this is for waiting the to-be-recovered trials from nnimanager self._advisor_initialized = False - @property - def initialized(self): - return self._advisor_initialized - def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]): """ Register callbacks for NNI backend. @@ -172,6 +169,10 @@ def send_trial(self, parameters, placement_constraint=None): Parameter ID that is assigned to this parameter, which will be used for identification in future. """ + while not self._advisor_initialized: + _logger.info('Wait for RetiariiAdvisor to be initialized...') + time.sleep(0.5) + self.parameters_count += 1 if placement_constraint is None: placement_constraint = { diff --git a/nni/nas/execution/common/integration_api.py b/nni/nas/execution/common/integration_api.py index 37c381bf6b..58e5e966a6 100644 --- a/nni/nas/execution/common/integration_api.py +++ b/nni/nas/execution/common/integration_api.py @@ -6,15 +6,12 @@ '_advisor' # FIXME: hack to make it importable for tests ] -import logging -import time import warnings from typing import NewType, Any import nni from nni.common.version import version_check -_logger = logging.getLogger(__name__) # NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor # because it would induce cycled import @@ -45,11 +42,7 @@ def send_trial(parameters: dict, placement_constraint=None) -> int: Send a new trial. Executed on tuner end. Return a ID that is the unique identifier for this trial. 
""" - advisor = get_advisor() - while not advisor.initialized: - _logger.info('Wait for RetiariiAdvisor to be initialized...') - time.sleep(0.5) - return advisor.send_trial(parameters, placement_constraint) + return get_advisor().send_trial(parameters, placement_constraint) def receive_trial_parameters() -> dict: """ From e679b4aaae83bc1a79c85257faec88533b101e43 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 13:26:05 +0800 Subject: [PATCH 75/77] quick fix --- test/ut/nas/test_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/ut/nas/test_engine.py b/test/ut/nas/test_engine.py index 2fb7949405..d91da0d00a 100644 --- a/test/ut/nas/test_engine.py +++ b/test/ut/nas/test_engine.py @@ -27,6 +27,7 @@ def test_base_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None advisor = RetiariiAdvisor('ws://_unittest_placeholder_') + advisor._advisor_initialized = True advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -44,6 +45,7 @@ def test_py_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None advisor = RetiariiAdvisor('ws://_unittest_placeholder_') + advisor._advisor_initialized = True advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() From b115deb92e236dedd655815bda4998dae984403f Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 14:49:01 +0800 Subject: [PATCH 76/77] fix incomplete test data --- test/ut/sdk/test_assessor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/ut/sdk/test_assessor.py b/test/ut/sdk/test_assessor.py index 48c2c03324..03af5149ab 100644 --- a/test/ut/sdk/test_assessor.py +++ b/test/ut/sdk/test_assessor.py @@ -48,11 +48,11 @@ class AssessorTestCase(TestCase): def test_assessor(self): pass _reverse_io() - send(CommandType.ReportMetricData, '{"trial_job_id":"A","type":"PERIODICAL","sequence":0,"value":"2"}') - send(CommandType.ReportMetricData, '{"trial_job_id":"B","type":"PERIODICAL","sequence":0,"value":"2"}') - send(CommandType.ReportMetricData, '{"trial_job_id":"A","type":"PERIODICAL","sequence":1,"value":"3"}') - send(CommandType.TrialEnd, '{"trial_job_id":"A","event":"SYS_CANCELED"}') - send(CommandType.TrialEnd, '{"trial_job_id":"B","event":"SUCCEEDED"}') + send(CommandType.ReportMetricData, '{"parameter_id": 0,"trial_job_id":"A","type":"PERIODICAL","sequence":0,"value":"2"}') + send(CommandType.ReportMetricData, '{"parameter_id": 1,"trial_job_id":"B","type":"PERIODICAL","sequence":0,"value":"2"}') + send(CommandType.ReportMetricData, '{"parameter_id": 0,"trial_job_id":"A","type":"PERIODICAL","sequence":1,"value":"3"}') + send(CommandType.TrialEnd, '{"trial_job_id":"A","event":"SYS_CANCELED","hyper_params":"{\\"parameter_id\\": 0}"}') + send(CommandType.TrialEnd, '{"trial_job_id":"B","event":"SUCCEEDED","hyper_params":"{\\"parameter_id\\": 1}"}') send(CommandType.NewTrialJob, 'null') _restore_io() From ef10426ded0ad03c9c9508be4a5ebe33a1be0716 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Tue, 4 Oct 2022 09:59:03 +0000 Subject: [PATCH 77/77] fix test of cgo engine --- test/algo/nas/test_cgo_engine.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/algo/nas/test_cgo_engine.py b/test/algo/nas/test_cgo_engine.py index 1da23cb33c..d5e089f0d1 100644 --- a/test/algo/nas/test_cgo_engine.py +++ b/test/algo/nas/test_cgo_engine.py @@ -319,6 +319,9 @@ def 
test_submit_models(self): advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() + # this is because RetiariiAdvisor only works after `_advisor_initialized` becomes True. + # normally it becomes true when `handle_request_trial_jobs` is invoked + advisor._advisor_initialized = True remote = RemoteConfig(machine_list=[]) remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1,2,3]))
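The last few test fixes all flip _advisor_initialized by hand because the series gates send_trial on that flag: it normally becomes True only when handle_request_trial_jobs is first invoked, so a strategy calling send_trial blocks until the advisor has heard from nnimanager. A self-contained sketch of that gate, with illustrative names only:

import time

class GatedAdvisor:
    # stand-in for the _advisor_initialized gate added to RetiariiAdvisor
    def __init__(self):
        self._advisor_initialized = False
        self.parameters_count = 0

    def handle_add_customized_trial(self, recovered_trials):
        # resume path: replayed trials set the parameter-id watermark so that
        # newly generated ids never collide with pre-resume ones
        self.parameters_count = max(
            (t['parameter_id'] for t in recovered_trials if isinstance(t.get('parameter_id'), int)),
            default=0)

    def handle_request_trial_jobs(self, num_trials):
        # normal path: the first request from nnimanager opens the gate
        self._advisor_initialized = True

    def send_trial(self, parameters):
        while not self._advisor_initialized:   # same busy-wait as in integration.py
            time.sleep(0.5)
        self.parameters_count += 1
        return self.parameters_count

advisor = GatedAdvisor()
advisor._advisor_initialized = True            # what the unit-test fixes above do
assert advisor.send_trial({'layer': 'conv'}) == 1

Tests that drive the advisor directly (test_engine.py and test_cgo_engine.py above) never receive a request-trial-jobs command, hence the explicit advisor._advisor_initialized = True before submitting models.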