From e766a22d82ff1200be6ab94802db204021a7877e Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 5 May 2022 09:28:08 +0800 Subject: [PATCH 01/77] update --- nni/retiarii/experiment/pytorch.py | 66 ++++++++++-------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 5d1ba969cb..0d81f3ffc8 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -11,7 +11,7 @@ from pathlib import Path from subprocess import Popen from threading import Thread -from typing import Any, List, Optional, Union, cast +from typing import Any, List, Optional, Union, cast, overload import colorama import psutil @@ -396,32 +396,29 @@ def _construct_devices(self): def _create_dispatcher(self): return self._dispatcher + @overload def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: + ... + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ Run the experiment. This function will block until experiment finish or error. """ - if isinstance(self.evaluator, BaseOneShotTrainer): - # TODO: will throw a deprecation warning soon - # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' - # 'We will try to convert this trainer to our new implementation to run the algorithm. ' - # 'In case you want to stick to the old implementation, ' - # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) - self.evaluator.fit() - - if config is None: - warnings.warn('config = None is deprecate in future. If you are running a one-shot experiment, ' - 'please consider creating a config and set execution engine to `oneshot`.', DeprecationWarning) - config = RetiariiExeConfig() - config.execution_engine = 'oneshot' - - if config.execution_engine == 'oneshot': + if not isinstance(port, int): + assert port is None or isinstance(port, RetiariiExeConfig) + warnings.warn('Passing `config` in run() is deprecated.') + if port is None: + config = RetiariiExeConfig() + config.execution_engine = 'oneshot' + self.config = config + else: + self.config = port # for backward compatibility, will remove in future release + + if self.config.execution_engine == 'oneshot': base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - assert config is not None, 'You are using classic search mode, config cannot be None!' - self.config = config - self.start(port, debug) + super().run(port, wait_completion, debug) def _check_exp_status(self) -> bool: """ @@ -453,40 +450,21 @@ def stop(self) -> None: """ Stop background experiment. 
""" - _logger.info('Stopping experiment, please wait...') - atexit.unregister(self.stop) - + _logger.info('To stop experiment...') # stop strategy first if self._dispatcher_thread is not None: self._dispatcher.stopping = True self._dispatcher_thread.join(timeout=1) - - if self.id is not None: - nni.runtime.log.stop_experiment_log(self.id) - if self._proc is not None: - try: - # this if is to deal with the situation that - # nnimanager is cleaned up by ctrl+c first - if self._proc.poll() is None: - rest.delete(self.port, '/experiment') - except Exception as e: - _logger.exception(e) - _logger.warning('Cannot gracefully stop experiment, killing NNI process...') - kill_command(self._proc.pid) - - if self._pipe is not None: - self._pipe.close() - - self.id = cast(str, None) - self.port = cast(int, None) - self._proc = None - self._pipe = None + self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None - _logger.info('Experiment stopped') + + super().stop() def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', formatter: str = 'dict') -> Any: """ + TODO: the base class may also need this method + Export several top performing models. For one-shot algorithms, only top-1 is supported. For others, ``optimize_mode`` and ``formatter`` are From e0749678532ac41ab017188193e6bd6a16b213c2 Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 5 May 2022 09:54:48 +0800 Subject: [PATCH 02/77] update --- nni/retiarii/experiment/pytorch.py | 58 +----------------------------- 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 0d873b8ae0..8718a5bd6d 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -316,9 +316,7 @@ def start(self, port: int = 8080, debug: bool = False) -> None: debug Whether to start in debug mode. 
""" - atexit.register(self.stop) - - self.config = self.config.canonical_copy() + super().start(port, debug) # we will probably need a execution engine factory to make this clean and elegant if self.config.execution_engine == 'base': @@ -345,43 +343,15 @@ def start(self, port: int = 8080, debug: bool = False) -> None: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') set_execution_engine(engine) - self.id = management.generate_experiment_id() - - if self.config.experiment_working_directory is not None: - log_dir = Path(self.config.experiment_working_directory, self.id, 'log') - else: - log_dir = Path.home() / f'nni-experiments/{self.id}/log' - nni.runtime.log.start_experiment_log(self.id, log_dir, debug) - - ws_url = f'ws://localhost:{port}/tuner' - self._proc = launcher.start_experiment('create', self.id, self.config, port, debug, # type: ignore - RunMode.Background, None, ws_url, ['retiarii']) - assert self._proc is not None - connect_websocket(ws_url) - - self.port = port # port will be None if start up failed - # dispatcher must be launched after pipe initialized # the logic to launch dispatcher in background should be refactored into dispatcher api self._dispatcher = self._create_dispatcher() self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() - ips = [self.config.nni_manager_ip] - for interfaces in psutil.net_if_addrs().values(): - for interface in interfaces: - if interface.family == socket.AF_INET: - ips.append(interface.address) - ips = [f'http://{ip}:{port}' for ip in ips if ip] - msg = 'Web UI URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL - _logger.info(msg) - - exp_status_checker = Thread(target=self._check_exp_status) - exp_status_checker.start() self._start_strategy() # TODO: the experiment should be completed, when strategy exits and there is no running job _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') - exp_status_checker.join() def _construct_devices(self): devices = [] @@ -421,32 +391,6 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals else: super().run(port, wait_completion, debug) - def _check_exp_status(self) -> bool: - """ - Run the experiment. - This function will block until experiment finish or error. - Return `True` when experiment done; or return `False` when experiment failed. - """ - assert self._proc is not None - try: - while True: - time.sleep(10) - # this if is to deal with the situation that - # nnimanager is cleaned up by ctrl+c first - if self._proc.poll() is None: - status = self.get_status() - else: - return False - if status == 'DONE' or status == 'STOPPED': - return True - if status == 'ERROR': - return False - except KeyboardInterrupt: - _logger.warning('KeyboardInterrupt detected') - finally: - self.stop() - raise RuntimeError('Check experiment status failed.') - def stop(self) -> None: """ Stop background experiment. 
From 1f4eeeaf6cfba9e91d3a41dd7c442fa2cab6ecbb Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 5 May 2022 19:48:44 +0800 Subject: [PATCH 03/77] update --- nni/experiment/experiment.py | 49 ++++++++------- nni/retiarii/experiment/pytorch.py | 97 +++++++++++++++++++----------- 2 files changed, 91 insertions(+), 55 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index f514e4bdea..5cd8052369 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -87,26 +87,13 @@ def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None else: self.config = config_or_platform - def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: - """ - Start the experiment in background. - - This method will raise exception on failure. - If it returns, the experiment should have been successfully started. - - Parameters - ---------- - port - The port of web UI. - debug - Whether to start in debug mode. - """ + def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: assert self.config is not None if run_mode is not RunMode.Detach: atexit.register(self.stop) config = self.config.canonical_copy() - if config.use_annotation: + if hasattr(config, "use_annotation") and config.use_annotation: # will be refactored raise RuntimeError('NNI annotation is not supported by Python experiment API.') if config.experiment_working_directory is not None: @@ -114,13 +101,10 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo else: # this should never happen in latest version, keep it until v2.7 for potential compatibility log_dir = Path.home() / f'nni-experiments/{self.id}/log' nni.runtime.log.start_experiment_log(self.id, log_dir, debug) + return config - self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, self.url_prefix) - assert self._proc is not None - - self.port = port # port will be None if start up failed - - ips = [config.nni_manager_ip] + def _start_end(self, port: int, nni_manager_ip: str) -> None: + ips = [nni_manager_ip] for interfaces in psutil.net_if_addrs().values(): for interface in interfaces: if interface.family == socket.AF_INET: @@ -129,6 +113,29 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo msg = 'Web portal URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL _logger.info(msg) + def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + """ + Start the experiment in background. + + This method will raise exception on failure. + If it returns, the experiment should have been successfully started. + + Parameters + ---------- + port + The port of web UI. + debug + Whether to start in debug mode. + """ + config = self._start_begin(debug, run_mode) + + self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, self.url_prefix) + assert self._proc is not None + + self.port = port # port will be None if start up failed + + self._start_end(port, config.nni_manager_ip) + def stop(self) -> None: """ Stop the experiment. diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 8718a5bd6d..620573e89c 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -1,11 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import atexit +from __future__ import annotations + import logging import os -import socket -import time import warnings from dataclasses import dataclass from pathlib import Path @@ -14,24 +13,22 @@ from typing import Any, List, Optional, Union, cast, overload import colorama -import psutil import torch import torch.nn as nn -import nni.runtime.log from nni.common.device import GPUDevice -from nni.experiment import Experiment, RunMode, launcher, management, rest +from nni.experiment import Experiment, RunMode from nni.experiment.config import utils from nni.experiment.config.base import ConfigBase from nni.experiment.config.training_service import TrainingServiceConfig from nni.experiment.config.training_services import RemoteConfig from nni.runtime.protocol import connect_websocket -from nni.tools.nnictl.command_utils import kill_command from ..codegen import model_to_pytorch_script from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine from ..execution.utils import get_mutation_dict +from ..execution.interface import AbstractExecutionEngine from ..graph import Evaluator from ..integration import RetiariiAdvisor from ..mutator import Mutator @@ -304,20 +301,7 @@ def _start_strategy(self): # TODO: find out a proper way to show no more trial message on WebUI # self._dispatcher.mark_experiment_as_ending() - def start(self, port: int = 8080, debug: bool = False) -> None: - """ - Start the experiment in background. - This method will raise exception on failure. - If it returns, the experiment should have been successfully started. - Parameters - ---------- - port - The port of web UI. - debug - Whether to start in debug mode. - """ - super().start(port, debug) - + def _create_execution_engine(self) -> AbstractExecutionEngine: # we will probably need a execution engine factory to make this clean and elegant if self.config.execution_engine == 'base': from ..execution.base import BaseExecutionEngine @@ -341,6 +325,32 @@ def start(self, port: int = 8080, debug: bool = False) -> None: engine = BenchmarkExecutionEngine(self.config.benchmark) else: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') + return engine + + def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + """ + Start the experiment in background. + This method will raise exception on failure. + If it returns, the experiment should have been successfully started. + Parameters + ---------- + port + The port of web UI. + debug + Whether to start in debug mode. + """ + config = self._start_begin(debug, run_mode) + + ws_url = f'ws://localhost:{port}/tuner' + self._proc = launcher.start_experiment('create', self.id, config, port, debug, # type: ignore + RunMode.Background, None, ws_url, ['retiarii']) + assert self._proc is not None + connect_websocket(ws_url) + self.port = port # port will be None if start up failed + + self._start_end(port, config.nni_manager_ip) + + engine = self._create_execution_engine() set_execution_engine(engine) # dispatcher must be launched after pipe initialized @@ -367,29 +377,25 @@ def _construct_devices(self): def _create_dispatcher(self): return self._dispatcher - @overload def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: - ... - def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ Run the experiment. 
This function will block until experiment finish or error. """ - if not isinstance(port, int): - assert port is None or isinstance(port, RetiariiExeConfig) - warnings.warn('Passing `config` in run() is deprecated.') - if port is None: - config = RetiariiExeConfig() - config.execution_engine = 'oneshot' - self.config = config - else: - self.config = port # for backward compatibility, will remove in future release + assert port is None or isinstance(port, RetiariiExeConfig) + warnings.warn('Passing `config` in run() is deprecated.') + if port is None: + config = RetiariiExeConfig() + config.execution_engine = 'oneshot' + self.config = config + else: + self.config = port # for backward compatibility, will remove in future release if self.config.execution_engine == 'oneshot': base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - super().run(port, wait_completion, debug) + super().run(port, True, debug) def stop(self) -> None: """ @@ -451,3 +457,26 @@ def retrain_model(self, model): this function retrains the exported model, and test it to output test accuracy """ raise NotImplementedError + + +class NasExperiment(RetiariiExperiment): + """ + This class is only a new interface wrapper. + """ + def __init__(self, model: nn.Module, + evaluator: Union[BaseOneShotTrainer, Evaluator], + strategy: BaseStrategy, + config_or_platform: ExperimentConfig | str | list[str] | None = 'local', + mutators: List[Mutator] = cast(List[Mutator], None)): + ... + + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: + """ + Run the experiment. + This function will block until experiment finish or error. + """ + if self.config.execution_engine == 'oneshot': + base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) + self.strategy.run(base_model_ir, self.applied_mutators) + else: + super().run(port, wait_completion, debug) \ No newline at end of file From b9c788b819d29f63996508f5819d7ac4bbe0d1ad Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 6 May 2022 19:14:18 +0800 Subject: [PATCH 04/77] update --- nni/experiment/config/base.py | 2 + nni/experiment/config/experiment_config.py | 9 +- nni/experiment/experiment.py | 7 +- nni/retiarii/execution/cgo_engine.py | 19 ++- nni/retiarii/experiment/__init__.py | 2 + nni/retiarii/experiment/config/__init__.py | 5 + .../experiment/config/engine_config.py | 43 +++++ .../experiment/config/experiment_config.py | 67 ++++++++ nni/retiarii/experiment/pytorch.py | 157 +++++------------- 9 files changed, 187 insertions(+), 124 deletions(-) create mode 100644 nni/retiarii/experiment/config/__init__.py create mode 100644 nni/retiarii/experiment/config/engine_config.py create mode 100644 nni/retiarii/experiment/config/experiment_config.py diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index f3d44e063f..a2de758315 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -158,7 +158,9 @@ def canonical_copy(self): A deep copy. 
""" canon = copy.deepcopy(self) + print(type(canon)) canon._canonicalize([]) + print(type(canon)) canon._validate_canonical() return canon diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index 20216d7c21..e8791c0bf7 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -141,7 +141,7 @@ def _canonicalize(self, _parents): msg = f'nni_manager_ip is not set, please make sure {ip} is accessible from training machines' logging.getLogger('nni.experiment.config').warning(msg) - def _validate_canonical(self): + def _validate_canonical(self, validate_tuner: bool = True): # FIXME: remove validate_tuner super()._validate_canonical() space_cnt = (self.search_space is not None) + (self.search_space_file is not None) @@ -164,10 +164,11 @@ def _validate_canonical(self): # currently I have only seen one issue of this kind #Path(self.experiment_working_directory).mkdir(parents=True, exist_ok=True) - utils.validate_gpu_indices(self.tuner_gpu_indices) + if validate_tuner: + utils.validate_gpu_indices(self.tuner_gpu_indices) - if self.tuner is None: - raise ValueError('ExperimentConfig: tuner must be set') + if self.tuner is None: + raise ValueError('ExperimentConfig: tuner must be set') def _load_search_space_file(search_space_path): # FIXME diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 5cd8052369..524f169b8c 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -10,7 +10,7 @@ import socket from subprocess import Popen import time -from typing import Any +from typing import Any, Optional import colorama import psutil @@ -92,8 +92,9 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: if run_mode is not RunMode.Detach: atexit.register(self.stop) + print(type(self.config)) config = self.config.canonical_copy() - if hasattr(config, "use_annotation") and config.use_annotation: # will be refactored + if hasattr(config, "use_annotation") and config.use_annotation: #TODO: will be refactored raise RuntimeError('NNI annotation is not supported by Python experiment API.') if config.experiment_working_directory is not None: @@ -103,7 +104,7 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: nni.runtime.log.start_experiment_log(self.id, log_dir, debug) return config - def _start_end(self, port: int, nni_manager_ip: str) -> None: + def _start_end(self, port: int, nni_manager_ip: Optional[str]) -> None: ips = [nni_manager_ip] for interfaces in psutil.net_if_addrs().values(): for interface in interfaces: diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 4ba11987a1..509708cf54 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -7,10 +7,11 @@ import string import time import threading -from typing import Iterable, List, Dict, Tuple +from typing import Iterable, List, Dict, Tuple, cast from dataclasses import dataclass from nni.common.device import GPUDevice, Device +from nni.experiment.config.training_services import RemoteConfig from .interface import AbstractExecutionEngine, AbstractGraphListener, WorkerInfo from .. import codegen, utils from ..graph import Model, ModelStatus, MetricData, Node @@ -31,7 +32,6 @@ class TrialSubmission: placement: Dict[Node, Device] grouped_models: List[Model] - class CGOExecutionEngine(AbstractExecutionEngine): """ The execution engine with Cross-Graph Optimization (CGO). 
@@ -50,7 +50,7 @@ class CGOExecutionEngine(AbstractExecutionEngine): The trials within one batch could apply cross-graph optimization. """ - def __init__(self, devices: List[Device] = None, + def __init__(self, training_service, max_concurrency: int = None, batch_waiting_time: int = 60, ) -> None: @@ -59,6 +59,8 @@ def __init__(self, devices: List[Device] = None, self.logical_plan_counter = 0 self.available_devices: List[Device] = [] self.max_concurrency: int = max_concurrency + + devices = self._construct_devices(training_service) for device in devices: self.available_devices.append(device) self.all_devices = self.available_devices.copy() @@ -88,6 +90,17 @@ def __init__(self, devices: List[Device] = None, self._consumer_thread = threading.Thread(target=self._consume_models) self._consumer_thread.start() + def _construct_devices(self, training_service): + devices = [] + if hasattr(training_service, 'machine_list'): + for machine in cast(RemoteConfig, training_service).machine_list: + assert machine.gpu_indices is not None, \ + 'gpu_indices must be set in RemoteMachineConfig for CGO execution engine' + assert isinstance(machine.gpu_indices, list), 'gpu_indices must be a list' + for gpu_idx in machine.gpu_indices: + devices.append(GPUDevice(machine.host, gpu_idx)) + return devices + def join(self): self._stopped = True self._consumer_thread.join() diff --git a/nni/retiarii/experiment/__init__.py b/nni/retiarii/experiment/__init__.py index e69de29bb2..0eca6426d9 100644 --- a/nni/retiarii/experiment/__init__.py +++ b/nni/retiarii/experiment/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. \ No newline at end of file diff --git a/nni/retiarii/experiment/config/__init__.py b/nni/retiarii/experiment/config/__init__.py new file mode 100644 index 0000000000..38bc427477 --- /dev/null +++ b/nni/retiarii/experiment/config/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .experiment_config import * +from .engine_config import * \ No newline at end of file diff --git a/nni/retiarii/experiment/config/engine_config.py b/nni/retiarii/experiment/config/engine_config.py new file mode 100644 index 0000000000..042e227012 --- /dev/null +++ b/nni/retiarii/experiment/config/engine_config.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from dataclasses import dataclass +from typing import Optional, List + +from nni.experiment.config.base import ConfigBase + +__all__ = ['ExecutionEngineConfig', 'BaseEngineConfig', 'OneshotEngineConfig', + 'PyEngineConfig', 'CgoEngineConfig', 'BenchmarkEngineConfig'] + +@dataclass(init=False) +class ExecutionEngineConfig(ConfigBase): + """ + """ + name: str + +@dataclass(init=False) +class PyEngineConfig(ExecutionEngineConfig): + name: str = 'py' + +@dataclass(init=False) +class OneshotEngineConfig(ExecutionEngineConfig): + name: str = 'oneshot' + +@dataclass(init=False) +class BaseEngineConfig(ExecutionEngineConfig): + name: str = 'base' + # input used in GraphConverterWithShape. Currently support shape tuple only. + dummy_input: Optional[List[int]] = None + +@dataclass(init=False) +class CgoEngineConfig(ExecutionEngineConfig): + name: str = 'cgo' + max_concurrency_cgo: Optional[int] = None + batch_waiting_time: Optional[int] = None + # input used in GraphConverterWithShape. Currently support shape tuple only. 
+ dummy_input: Optional[List[int]] = None + +@dataclass(init=False) +class BenchmarkEngineConfig(ExecutionEngineConfig): + name: str = 'benchmark' + benchmark: Optional[str] = None \ No newline at end of file diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py new file mode 100644 index 0000000000..69d7185220 --- /dev/null +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +from dataclasses import dataclass +from typing import Any, Optional, Union + +from nni.experiment.config import utils, ExperimentConfig + +from .engine_config import ExecutionEngineConfig, PyEngineConfig + +__all__ = ['RetiariiExeConfig'] + +def execution_engine_config_factory(engine_name): + # FIXME: may move this function to experiment utils in future + cls = _get_ee_config_class(engine_name) + if cls is None: + raise ValueError(f'Invalid execution engine name: {engine_name}') + return cls() + +def _get_ee_config_class(engine_name): + for cls in ExecutionEngineConfig.__subclasses__(): + if cls.name == engine_name: + return cls + return None + +@dataclass(init=False) +class RetiariiExeConfig(ExperimentConfig): + # FIXME: refactor this class to inherit from a new common base class with HPO config + search_space: Any = '' + trial_code_directory: utils.PathLike = '.' + trial_command: str = '_reserved' + + execution_engine: ExecutionEngineConfig = PyEngineConfig() + + def __init__(self, training_service_platform: Optional[str] = None, + execution_engine: Union[str, ExecutionEngineConfig] = None, #TODO: having default value or not? + **kwargs): + super().__init__(training_service_platform, **kwargs) + + if execution_engine is not None: + # the user chose to init with `config = ExperimentConfig('local')` and set fields later + # we need to create empty training service & algorithm configs to support `config.tuner.name = 'random'` + assert utils.is_missing(self.execution_engine) + if isinstance(execution_engine, str): + self.execution_engine = execution_engine_config_factory(execution_engine) + else: + self.execution_engine = execution_engine + + self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name + + def __setattr__(self, key, value): + #TODO: tuner settings can also be blocked here + fixed_attrs = {'search_space': '', + 'trial_command': '_reserved'} + if key in fixed_attrs and fixed_attrs[key] != value: + raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') + # 'trial_code_directory' is handled differently because the path will be converted to absolute path by us + if key == 'trial_code_directory' and not (str(value) == '.' or os.path.isabs(value)): + raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') + #if key == 'execution_engine': + # assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.' 
+ # self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value + super().__setattr__(key, value) #TODO: double check whether new fields are validated + + def _validate_canonical(self): + super()._validate_canonical(False) \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 620573e89c..919026a589 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -4,25 +4,25 @@ from __future__ import annotations import logging -import os + import warnings -from dataclasses import dataclass -from pathlib import Path from subprocess import Popen from threading import Thread -from typing import Any, List, Optional, Union, cast, overload +from typing import Any, List, Optional, Union, cast import colorama +import psutil +from typing_extensions import Literal + import torch import torch.nn as nn -from nni.common.device import GPUDevice -from nni.experiment import Experiment, RunMode -from nni.experiment.config import utils -from nni.experiment.config.base import ConfigBase -from nni.experiment.config.training_service import TrainingServiceConfig -from nni.experiment.config.training_services import RemoteConfig +from nni.retiarii.experiment.config.engine_config import * +import nni.runtime.log +from nni.experiment import Experiment, RunMode, launcher, management +from nni.experiment.config import ExperimentConfig from nni.runtime.protocol import connect_websocket +from .config import RetiariiExeConfig, OneshotEngineConfig from ..codegen import model_to_pytorch_script from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape @@ -43,79 +43,7 @@ _logger = logging.getLogger(__name__) -__all__ = ['RetiariiExeConfig', 'RetiariiExperiment'] - - -@dataclass(init=False) -class RetiariiExeConfig(ConfigBase): - experiment_name: Optional[str] = None - search_space: Any = '' # TODO: remove - trial_command: str = '_reserved' - trial_code_directory: utils.PathLike = '.' - trial_concurrency: int - trial_gpu_number: int = 0 - devices: Optional[List[Union[str, GPUDevice]]] = None - max_experiment_duration: Optional[str] = None - max_trial_number: Optional[int] = None - max_concurrency_cgo: Optional[int] = None - batch_waiting_time: Optional[int] = None - nni_manager_ip: Optional[str] = None - debug: bool = False - log_level: str = 'info' - experiment_working_directory: utils.PathLike = '~/nni-experiments' - # remove configuration of tuner/assessor/advisor - training_service: TrainingServiceConfig - execution_engine: str = 'py' - - # input used in GraphConverterWithShape. Currently support shape tuple only. - dummy_input: Optional[List[int]] = None - - # input used for benchmark engine. 
- benchmark: Optional[str] = None - - def __init__(self, training_service_platform: Optional[str] = None, **kwargs): - super().__init__(**kwargs) - if training_service_platform is not None: - assert 'training_service' not in kwargs - self.training_service = utils.training_service_config_factory(platform=training_service_platform) - self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry py' - - def __setattr__(self, key, value): - fixed_attrs = {'search_space': '', - 'trial_command': '_reserved'} - if key in fixed_attrs and fixed_attrs[key] != value: - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - # 'trial_code_directory' is handled differently because the path will be converted to absolute path by us - if key == 'trial_code_directory' and not (str(value) == '.' or os.path.isabs(value)): - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - if key == 'execution_engine': - assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.' - self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value - self.__dict__[key] = value - - def validate(self, initialized_tuner: bool = False) -> None: - super().validate() - - @property - def _canonical_rules(self): - return _canonical_rules - - @property - def _validation_rules(self): - return _validation_rules - - -_canonical_rules = { -} - -_validation_rules = { - 'trial_code_directory': lambda value: (Path(value).is_dir(), f'"{value}" does not exist or is not directory'), - 'trial_concurrency': lambda value: value > 0, - 'trial_gpu_number': lambda value: value >= 0, - 'max_trial_number': lambda value: value > 0, - 'log_level': lambda value: value in ["trace", "debug", "info", "warning", "error", "fatal"], - 'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class') -} +__all__ = ['RetiariiExperiment', 'NasExperiment'] def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dummy_input=None, oneshot=False): @@ -252,6 +180,8 @@ class RetiariiExperiment(Experiment): def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): + nni.runtime.log.init_logger_for_command_line() + if trainer is not None: warnings.warn('Usage of `trainer` in RetiariiExperiment is deprecated and will be removed soon. ' 'Please consider specifying it as a positional argument, or use `evaluator`.', DeprecationWarning) @@ -260,6 +190,13 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E if evaluator is None: raise ValueError('Evaluator should not be none.') + self.config: RetiariiExeConfig | None = None + self.id: str = management.generate_experiment_id() + self.port: int | None = None + self._proc: Popen | psutil.Process | None = None + self._action: Literal['create', 'resume', 'view'] = 'create' + self.url_prefix: str | None = None + # TODO: The current design of init interface of Retiarii experiment needs to be reviewed. 
self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) self.port: Optional[int] = None @@ -289,8 +226,8 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E def _start_strategy(self): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, - full_ir=self.config.execution_engine not in ['py', 'benchmark'], - dummy_input=self.config.dummy_input + full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=self.config.execution_engine.dummy_input if hasattr(self.config.execution_engine, 'dummy_input') else None ) _logger.info('Start strategy...') @@ -303,23 +240,22 @@ def _start_strategy(self): def _create_execution_engine(self) -> AbstractExecutionEngine: # we will probably need a execution engine factory to make this clean and elegant - if self.config.execution_engine == 'base': + if isinstance(self.config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine engine = BaseExecutionEngine() - elif self.config.execution_engine == 'cgo': + elif isinstance(self.config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine assert self.config.training_service.platform == 'remote', \ "CGO execution engine currently only supports remote training service" assert self.config.batch_waiting_time is not None and self.config.max_concurrency_cgo is not None - devices = self._construct_devices() - engine = CGOExecutionEngine(devices, + engine = CGOExecutionEngine(self.config.training_service, max_concurrency=self.config.max_concurrency_cgo, batch_waiting_time=self.config.batch_waiting_time) - elif self.config.execution_engine == 'py': + elif isinstance(self.config.execution_engine, PyEngineConfig): from ..execution.python import PurePythonExecutionEngine engine = PurePythonExecutionEngine() - elif self.config.execution_engine == 'benchmark': + elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine assert self.config.benchmark is not None, '"benchmark" must be set when benchmark execution engine is used.' engine = BenchmarkExecutionEngine(self.config.benchmark) @@ -363,17 +299,6 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo # TODO: the experiment should be completed, when strategy exits and there is no running job _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') - def _construct_devices(self): - devices = [] - if hasattr(self.config.training_service, 'machine_list'): - for machine in cast(RemoteConfig, self.config.training_service).machine_list: - assert machine.gpu_indices is not None, \ - 'gpu_indices must be set in RemoteMachineConfig for CGO execution engine' - assert isinstance(machine.gpu_indices, list), 'gpu_indices must be a list' - for gpu_idx in machine.gpu_indices: - devices.append(GPUDevice(machine.host, gpu_idx)) - return devices - def _create_dispatcher(self): return self._dispatcher @@ -382,16 +307,20 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu Run the experiment. This function will block until experiment finish or error. 
""" - assert port is None or isinstance(port, RetiariiExeConfig) - warnings.warn('Passing `config` in run() is deprecated.') - if port is None: + if isinstance(self.evaluator, BaseOneShotTrainer): + # TODO: will throw a deprecation warning soon + # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' + # 'We will try to convert this trainer to our new implementation to run the algorithm. ' + # 'In case you want to stick to the old implementation, ' + # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) + self.evaluator.fit() + + if config is None: config = RetiariiExeConfig() - config.execution_engine = 'oneshot' - self.config = config - else: - self.config = port # for backward compatibility, will remove in future release - - if self.config.execution_engine == 'oneshot': + config.execution_engine = OneshotEngineConfig() + self.config = config + + if self.config.execution_engine.name == 'oneshot': base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: @@ -406,7 +335,7 @@ def stop(self) -> None: if self._dispatcher_thread is not None: self._dispatcher.stopping = True self._dispatcher_thread.join(timeout=1) - + self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None @@ -434,7 +363,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``dict``, the mutation history will be returned. """ if formatter == 'code': - assert self.config.execution_engine != 'py', 'You should use `dict` formatter when using Python execution engine.' + assert not isinstance(self.config.execution_engine, PyEngineConfig), 'You should use `dict` formatter when using Python execution engine.' if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' return self.evaluator.export() @@ -475,7 +404,7 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals Run the experiment. This function will block until experiment finish or error. """ - if self.config.execution_engine == 'oneshot': + if self.config.execution_engine.name == 'oneshot': #TODO base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: From 1e97e042bb3dcf0856e0c64a0ef5d045de3493ea Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 6 May 2022 19:59:00 +0800 Subject: [PATCH 05/77] update --- .../experiment/config/experiment_config.py | 2 +- nni/retiarii/experiment/pytorch.py | 45 +++++++------------ 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 69d7185220..4c3e57caf0 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -30,7 +30,7 @@ class RetiariiExeConfig(ExperimentConfig): search_space: Any = '' trial_code_directory: utils.PathLike = '.' 
trial_command: str = '_reserved' - + # new config field for NAS execution_engine: ExecutionEngineConfig = PyEngineConfig() def __init__(self, training_service_platform: Optional[str] = None, diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 919026a589..283ae982a8 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -177,8 +177,10 @@ class RetiariiExperiment(Experiment): ... final_model = Net() """ - def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), - applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), + def __init__(self, base_model: nn.Module, + evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), + applied_mutators: List[Mutator] = cast(List[Mutator], None), + strategy: BaseStrategy = cast(BaseStrategy, None), trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): nni.runtime.log.init_logger_for_command_line() @@ -196,8 +198,6 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E self._proc: Popen | psutil.Process | None = None self._action: Literal['create', 'resume', 'view'] = 'create' self.url_prefix: str | None = None - - # TODO: The current design of init interface of Retiarii experiment needs to be reviewed. self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) self.port: Optional[int] = None @@ -206,16 +206,6 @@ def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, E self.applied_mutators = applied_mutators self.strategy = strategy - from nni.retiarii.oneshot.pytorch.strategy import OneShotStrategy - if not isinstance(strategy, OneShotStrategy): - self._dispatcher = RetiariiAdvisor() - else: - self._dispatcher = cast(RetiariiAdvisor, None) - self._dispatcher_thread: Optional[Thread] = None - self._proc: Optional[Popen] = None - - self.url_prefix = None - # check for sanity if not is_model_wrapped(base_model): warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + @@ -238,8 +228,8 @@ def _start_strategy(self): # TODO: find out a proper way to show no more trial message on WebUI # self._dispatcher.mark_experiment_as_ending() - def _create_execution_engine(self) -> AbstractExecutionEngine: - # we will probably need a execution engine factory to make this clean and elegant + def _create_execution_engine(self) -> None: + #TODO: we will probably need a execution engine factory to make this clean and elegant if isinstance(self.config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine engine = BaseExecutionEngine() @@ -261,7 +251,7 @@ def _create_execution_engine(self) -> AbstractExecutionEngine: engine = BenchmarkExecutionEngine(self.config.benchmark) else: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') - return engine + set_execution_engine(engine) def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: """ @@ -286,12 +276,11 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) - engine = self._create_execution_engine() - set_execution_engine(engine) + self._create_execution_engine() # FIXME: engine cannot be created twice + self._dispatcher = RetiariiAdvisor() # dispatcher must be launched after pipe initialized # the logic to launch dispatcher in background should be refactored into dispatcher api - 
self._dispatcher = self._create_dispatcher() self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() @@ -299,9 +288,6 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo # TODO: the experiment should be completed, when strategy exits and there is no running job _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') - def _create_dispatcher(self): - return self._dispatcher - def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: """ Run the experiment. @@ -314,16 +300,16 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu # 'In case you want to stick to the old implementation, ' # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) self.evaluator.fit() + return if config is None: - config = RetiariiExeConfig() - config.execution_engine = OneshotEngineConfig() - self.config = config - - if self.config.execution_engine.name == 'oneshot': + self.config = RetiariiExeConfig() + self.config.execution_engine = OneshotEngineConfig() base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) + # FIXME: oneshot strategy should also be executable on training services self.strategy.run(base_model_ir, self.applied_mutators) else: + self.config = config super().run(port, True, debug) def stop(self) -> None: @@ -396,6 +382,7 @@ def __init__(self, model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator], strategy: BaseStrategy, config_or_platform: ExperimentConfig | str | list[str] | None = 'local', + execution_engine: Union[str, ExecutionEngineConfig] = 'py', mutators: List[Mutator] = cast(List[Mutator], None)): ... @@ -404,7 +391,7 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals Run the experiment. This function will block until experiment finish or error. """ - if self.config.execution_engine.name == 'oneshot': #TODO + if isinstance(self.config.execution_engine.name, OneshotEngineConfig): base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: From 9e39e92a111db685891e1deb5f853a465b77e205 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 7 May 2022 13:33:04 +0800 Subject: [PATCH 06/77] runnable --- nni/experiment/experiment.py | 31 +++++++++++++++++------------- nni/retiarii/experiment/pytorch.py | 26 ++++++++++++------------- ts/nni_manager/core/nnimanager.ts | 2 +- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 524f169b8c..7f4402f55f 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -92,7 +92,6 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: if run_mode is not RunMode.Detach: atexit.register(self.stop) - print(type(self.config)) config = self.config.canonical_copy() if hasattr(config, "use_annotation") and config.use_annotation: #TODO: will be refactored raise RuntimeError('NNI annotation is not supported by Python experiment API.') @@ -137,11 +136,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) - def stop(self) -> None: - """ - Stop the experiment. 
- """ - _logger.info('Stopping experiment, please wait...') + def _stop(self) -> None: atexit.unregister(self.stop) nni.runtime.log.stop_experiment_log(self.id) @@ -156,8 +151,24 @@ def stop(self) -> None: self.id = None # type: ignore self.port = None self._proc = None + + def stop(self) -> None: + """ + Stop the experiment. + """ + _logger.info('Stopping experiment, please wait...') + self._stop() _logger.info('Experiment stopped') + def _wait_completion(self) -> None: + while True: + time.sleep(10) + status = self.get_status() + if status == 'DONE' or status == 'STOPPED': + return True + if status == 'ERROR': + return False + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ Run the experiment. @@ -171,13 +182,7 @@ def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = Fals self.start(port, debug) if wait_completion: try: - while True: - time.sleep(10) - status = self.get_status() - if status == 'DONE' or status == 'STOPPED': - return True - if status == 'ERROR': - return False + self._wait_completion() except KeyboardInterrupt: _logger.warning('KeyboardInterrupt detected') self.stop() diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 283ae982a8..27ad37d3e7 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -276,17 +276,13 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) - self._create_execution_engine() # FIXME: engine cannot be created twice - self._dispatcher = RetiariiAdvisor() # dispatcher must be launched after pipe initialized # the logic to launch dispatcher in background should be refactored into dispatcher api self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() - self._start_strategy() - # TODO: the experiment should be completed, when strategy exits and there is no running job - _logger.info('Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...') + self._create_execution_engine() # FIXME: engine cannot be created twice def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: """ @@ -310,22 +306,24 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu self.strategy.run(base_model_ir, self.applied_mutators) else: self.config = config - super().run(port, True, debug) + self.start(port, debug) + try: + self._start_strategy() + self._wait_completion() + except KeyboardInterrupt: + _logger.warning('KeyboardInterrupt detected') + self.stop() def stop(self) -> None: """ Stop background experiment. 
""" - _logger.info('To stop experiment...') - # stop strategy first - if self._dispatcher_thread is not None: - self._dispatcher.stopping = True - self._dispatcher_thread.join(timeout=1) - + _logger.info('Stopping experiment, please wait...') + self._stop() + self._dispatcher_thread.join() self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None - - super().stop() + _logger.info('Experiment stopped') def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', formatter: str = 'dict') -> Any: """ diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 7ad5a0130f..54a42760cc 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -303,8 +303,8 @@ class NNIManager implements Manager { } this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener); + this.dispatcher.sendCommand(TERMINATE); if (this.dispatcherPid > 0) { - this.dispatcher.sendCommand(TERMINATE); // gracefully terminate tuner and assessor here, wait at most 30 seconds. for (let i: number = 0; i < 30; i++) { if (!await isAlive(this.dispatcherPid)) { From 6ebd77430de26770d524452340866922737ec710 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 9 May 2022 19:56:57 +0800 Subject: [PATCH 07/77] update --- nni/experiment/experiment.py | 2 +- nni/retiarii/execution/base.py | 11 ++++++++--- nni/retiarii/execution/cgo_engine.py | 5 +++++ nni/retiarii/experiment/pytorch.py | 22 +++++++++++++--------- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 7f4402f55f..b464e3d48a 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -162,12 +162,12 @@ def stop(self) -> None: def _wait_completion(self) -> None: while True: - time.sleep(10) status = self.get_status() if status == 'DONE' or status == 'STOPPED': return True if status == 'ERROR': return False + time.sleep(10) def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: """ diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index d8cda6cc8a..d488ce1d4c 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -7,6 +7,8 @@ import string from typing import Any, Dict, Iterable, List +from nni.experiment import rest + from .interface import AbstractExecutionEngine, AbstractGraphListener from .utils import get_mutation_summary from .. import codegen, utils @@ -54,12 +56,15 @@ class BaseExecutionEngine(AbstractExecutionEngine): Resource management is implemented in this class. """ - def __init__(self) -> None: + def __init__(self, rest_port: int = None, rest_url_prefix: str = None) -> None: """ Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. Base execution engine will get those callbacks and broadcast them to graph listener. 
""" + self.port = rest_port + self.url_prefix = rest_url_prefix + self._listeners: List[AbstractGraphListener] = [] # register advisor callbacks @@ -123,8 +128,8 @@ def query_available_resource(self) -> int: return self.resources def budget_exhausted(self) -> bool: - advisor = get_advisor() - return advisor.stopping + resp = rest.get(self.port, '/check-status', self.url_prefix) + return resp['status'] == 'DONE' @classmethod def pack_model_data(cls, model: Model) -> Any: diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 509708cf54..94459199dd 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -53,7 +53,12 @@ class CGOExecutionEngine(AbstractExecutionEngine): def __init__(self, training_service, max_concurrency: int = None, batch_waiting_time: int = 60, + rest_port: int = None, + rest_url_prefix: str = None ) -> None: + self.port = rest_port + self.url_prefix = rest_url_prefix + self._listeners: List[AbstractGraphListener] = [] self._running_models: Dict[int, Model] = dict() self.logical_plan_counter = 0 diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 27ad37d3e7..69a3ae5d6e 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -192,14 +192,12 @@ def __init__(self, base_model: nn.Module, if evaluator is None: raise ValueError('Evaluator should not be none.') - self.config: RetiariiExeConfig | None = None self.id: str = management.generate_experiment_id() self.port: int | None = None self._proc: Popen | psutil.Process | None = None self._action: Literal['create', 'resume', 'view'] = 'create' self.url_prefix: str | None = None - self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) - self.port: Optional[int] = None + self.config: RetiariiExeConfig | None = cast(RetiariiExeConfig, None) self.base_model = base_model self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator @@ -213,7 +211,7 @@ def __init__(self, base_model: nn.Module, 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, RuntimeWarning) - def _start_strategy(self): + def _run_strategy(self): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), @@ -232,7 +230,7 @@ def _create_execution_engine(self) -> None: #TODO: we will probably need a execution engine factory to make this clean and elegant if isinstance(self.config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine - engine = BaseExecutionEngine() + engine = BaseExecutionEngine(self.port, self.url_prefix) elif isinstance(self.config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine @@ -241,10 +239,12 @@ def _create_execution_engine(self) -> None: assert self.config.batch_waiting_time is not None and self.config.max_concurrency_cgo is not None engine = CGOExecutionEngine(self.config.training_service, max_concurrency=self.config.max_concurrency_cgo, - batch_waiting_time=self.config.batch_waiting_time) + batch_waiting_time=self.config.batch_waiting_time, + rest_port=self.port, + rest_url_prefix=self.url_prefix) elif isinstance(self.config.execution_engine, PyEngineConfig): from ..execution.python import PurePythonExecutionEngine - engine = PurePythonExecutionEngine() + engine = PurePythonExecutionEngine(self.port, self.url_prefix) elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine assert self.config.benchmark is not None, '"benchmark" must be set when benchmark execution engine is used.' @@ -284,7 +284,9 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._create_execution_engine() # FIXME: engine cannot be created twice - def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debug: bool = False) -> None: + def run(self, config: Optional[RetiariiExeConfig] = None, + port: int = 8080, + debug: bool = False) -> None: """ Run the experiment. This function will block until experiment finish or error. 
@@ -308,11 +310,13 @@ def run(self, config: Optional[RetiariiExeConfig] = None, port: int = 8080, debu self.config = config self.start(port, debug) try: - self._start_strategy() + self._run_strategy() + # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: _logger.warning('KeyboardInterrupt detected') self.stop() + _logger.info('Search process is done, the experiment is still alive') def stop(self) -> None: """ From 81ff2469a59541427c347546927be50320866b5b Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 9 May 2022 19:59:56 +0800 Subject: [PATCH 08/77] update --- examples/nas/multi-trial/mnist/search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nas/multi-trial/mnist/search.py b/examples/nas/multi-trial/mnist/search.py index 52d1007493..6ee65a70bb 100644 --- a/examples/nas/multi-trial/mnist/search.py +++ b/examples/nas/multi-trial/mnist/search.py @@ -131,7 +131,7 @@ def evaluate_model(model_cls): exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_search' exp_config.trial_concurrency = 2 - exp_config.max_trial_number = 20 + exp_config.max_trial_number = 4 exp_config.training_service.use_active_gpu = False export_formatter = 'dict' @@ -139,7 +139,8 @@ def evaluate_model(model_cls): # exp_config.execution_engine = 'base' # export_formatter = 'code' - exp.run(exp_config, 8080) + exp.run(exp_config, 8090) print('Final model:') for model_code in exp.export_top_models(formatter=export_formatter): print(model_code) + exp.stop() \ No newline at end of file From 5d3e68122f6058b2f751c3822283d785b9d6b33f Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 14:45:28 +0800 Subject: [PATCH 09/77] fix pylint --- nni/retiarii/experiment/pytorch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 69a3ae5d6e..e85c581251 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -16,19 +16,20 @@ import torch import torch.nn as nn -from nni.retiarii.experiment.config.engine_config import * import nni.runtime.log from nni.experiment import Experiment, RunMode, launcher, management from nni.experiment.config import ExperimentConfig from nni.runtime.protocol import connect_websocket -from .config import RetiariiExeConfig, OneshotEngineConfig +from .config import ( + RetiariiExeConfig, ExecutionEngineConfig, OneshotEngineConfig, BaseEngineConfig, + PyEngineConfig, CgoEngineConfig, BenchmarkEngineConfig +) from ..codegen import model_to_pytorch_script from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine from ..execution.utils import get_mutation_dict -from ..execution.interface import AbstractExecutionEngine from ..graph import Evaluator from ..integration import RetiariiAdvisor from ..mutator import Mutator @@ -351,7 +352,8 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``dict``, the mutation history will be returned. """ if formatter == 'code': - assert not isinstance(self.config.execution_engine, PyEngineConfig), 'You should use `dict` formatter when using Python execution engine.' + assert not isinstance(self.config.execution_engine, PyEngineConfig), \ + 'You should use `dict` formatter when using Python execution engine.' 
if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' return self.evaluator.export() From 9c580d5e24e4d6f87aeb0e22deed6d930396af19 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 18:03:51 +0800 Subject: [PATCH 10/77] fix pyright --- nni/experiment/experiment.py | 2 +- nni/retiarii/execution/base.py | 3 ++- nni/retiarii/execution/cgo_engine.py | 4 ++-- nni/retiarii/experiment/pytorch.py | 15 +++++++++------ 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index b464e3d48a..4dc732eb15 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -160,7 +160,7 @@ def stop(self) -> None: self._stop() _logger.info('Experiment stopped') - def _wait_completion(self) -> None: + def _wait_completion(self) -> bool: while True: status = self.get_status() if status == 'DONE' or status == 'STOPPED': diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index d488ce1d4c..8cfe4a315c 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from __future__ import annotations import logging import os @@ -56,7 +57,7 @@ class BaseExecutionEngine(AbstractExecutionEngine): Resource management is implemented in this class. """ - def __init__(self, rest_port: int = None, rest_url_prefix: str = None) -> None: + def __init__(self, rest_port: int | None = None, rest_url_prefix: str = None) -> None: """ Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 94459199dd..376d2afe5f 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -53,8 +53,8 @@ class CGOExecutionEngine(AbstractExecutionEngine): def __init__(self, training_service, max_concurrency: int = None, batch_waiting_time: int = 60, - rest_port: int = None, - rest_url_prefix: str = None + rest_port: int | None = None, + rest_url_prefix: str | None = None ) -> None: self.port = rest_port self.url_prefix = rest_url_prefix diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index e85c581251..cf08c2fec8 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -198,7 +198,7 @@ def __init__(self, base_model: nn.Module, self._proc: Popen | psutil.Process | None = None self._action: Literal['create', 'resume', 'view'] = 'create' self.url_prefix: str | None = None - self.config: RetiariiExeConfig | None = cast(RetiariiExeConfig, None) + self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) self.base_model = base_model self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator @@ -216,7 +216,8 @@ def _run_strategy(self): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=self.config.execution_engine.dummy_input if hasattr(self.config.execution_engine, 'dummy_input') else None + dummy_input=self.config.execution_engine.dummy_input + if isinstance(self.config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) _logger.info('Start strategy...') @@ -235,12 +236,14 @@ def 
_create_execution_engine(self) -> None: elif isinstance(self.config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine - assert self.config.training_service.platform == 'remote', \ + assert not isinstance(self.config.training_service, list) \ + and self.config.training_service.platform == 'remote', \ "CGO execution engine currently only supports remote training service" - assert self.config.batch_waiting_time is not None and self.config.max_concurrency_cgo is not None + assert self.config.execution_engine.batch_waiting_time is not None \ + and self.config.execution_engine.max_concurrency_cgo is not None engine = CGOExecutionEngine(self.config.training_service, - max_concurrency=self.config.max_concurrency_cgo, - batch_waiting_time=self.config.batch_waiting_time, + max_concurrency=self.config.execution_engine.max_concurrency_cgo, + batch_waiting_time=self.config.execution_engine.batch_waiting_time, rest_port=self.port, rest_url_prefix=self.url_prefix) elif isinstance(self.config.execution_engine, PyEngineConfig): From 1c2f6debf8e742f940f2ce2ce11c817df713a249 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 18:16:35 +0800 Subject: [PATCH 11/77] update --- nni/experiment/config/base.py | 2 -- nni/experiment/experiment.py | 2 +- nni/retiarii/execution/cgo_engine.py | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index a2de758315..f3d44e063f 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -158,9 +158,7 @@ def canonical_copy(self): A deep copy. """ canon = copy.deepcopy(self) - print(type(canon)) canon._canonicalize([]) - print(type(canon)) canon._validate_canonical() return canon diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 4dc732eb15..3d06c7de34 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -93,7 +93,7 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: atexit.register(self.stop) config = self.config.canonical_copy() - if hasattr(config, "use_annotation") and config.use_annotation: #TODO: will be refactored + if config.use_annotation: raise RuntimeError('NNI annotation is not supported by Python experiment API.') if config.experiment_working_directory is not None: diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index 376d2afe5f..acf3fd3524 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from __future__ import annotations import logging import os From 5086e0a96f9b8206a942d40e4b04890862064188 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 20:44:29 +0800 Subject: [PATCH 12/77] fix pyright --- .../experiment/config/experiment_config.py | 5 ++-- nni/retiarii/experiment/pytorch.py | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 4c3e57caf0..dcf9d1d9ad 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+from __future__ import annotations import os from dataclasses import dataclass @@ -33,8 +34,8 @@ class RetiariiExeConfig(ExperimentConfig): # new config field for NAS execution_engine: ExecutionEngineConfig = PyEngineConfig() - def __init__(self, training_service_platform: Optional[str] = None, - execution_engine: Union[str, ExecutionEngineConfig] = None, #TODO: having default value or not? + def __init__(self, training_service_platform: str | None = None, + execution_engine: str | ExecutionEngineConfig = PyEngineConfig(), **kwargs): super().__init__(training_service_platform, **kwargs) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index cf08c2fec8..6c92e965d2 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -251,8 +251,9 @@ def _create_execution_engine(self) -> None: engine = PurePythonExecutionEngine(self.port, self.url_prefix) elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine - assert self.config.benchmark is not None, '"benchmark" must be set when benchmark execution engine is used.' - engine = BenchmarkExecutionEngine(self.config.benchmark) + assert self.config.execution_engine.benchmark is not None, \ + '"benchmark" must be set when benchmark execution engine is used.' + engine = BenchmarkExecutionEngine(self.config.execution_engine.benchmark) else: raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') set_execution_engine(engine) @@ -328,7 +329,8 @@ def stop(self) -> None: """ _logger.info('Stopping experiment, please wait...') self._stop() - self._dispatcher_thread.join() + if self._dispatcher_thread: + self._dispatcher_thread.join() self._dispatcher = cast(RetiariiAdvisor, None) self._dispatcher_thread = None _logger.info('Experiment stopped') @@ -380,11 +382,11 @@ def retrain_model(self, model): """ raise NotImplementedError - +""" class NasExperiment(RetiariiExperiment): - """ - This class is only a new interface wrapper. - """ + + #This class is only a new interface wrapper. + def __init__(self, model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator], strategy: BaseStrategy, @@ -394,12 +396,11 @@ def __init__(self, model: nn.Module, ... def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: - """ - Run the experiment. - This function will block until experiment finish or error. - """ + #Run the experiment. + #This function will block until experiment finish or error. 
if isinstance(self.config.execution_engine.name, OneshotEngineConfig): base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - super().run(port, wait_completion, debug) \ No newline at end of file + super().run(port, wait_completion, debug) +""" \ No newline at end of file From db9f4e4653a044ef6575797cba0860d4526bb6f9 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 20:58:19 +0800 Subject: [PATCH 13/77] update --- nni/retiarii/execution/base.py | 2 +- nni/retiarii/experiment/config/experiment_config.py | 2 +- nni/retiarii/experiment/pytorch.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index 8cfe4a315c..869db9f1a5 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -57,7 +57,7 @@ class BaseExecutionEngine(AbstractExecutionEngine): Resource management is implemented in this class. """ - def __init__(self, rest_port: int | None = None, rest_url_prefix: str = None) -> None: + def __init__(self, rest_port: int | None = None, rest_url_prefix: str | None = None) -> None: """ Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index dcf9d1d9ad..456b404aa1 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Any from nni.experiment.config import utils, ExperimentConfig diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 6c92e965d2..6736ca2561 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -18,11 +18,10 @@ import torch.nn as nn import nni.runtime.log from nni.experiment import Experiment, RunMode, launcher, management -from nni.experiment.config import ExperimentConfig from nni.runtime.protocol import connect_websocket from .config import ( - RetiariiExeConfig, ExecutionEngineConfig, OneshotEngineConfig, BaseEngineConfig, + RetiariiExeConfig, OneshotEngineConfig, BaseEngineConfig, PyEngineConfig, CgoEngineConfig, BenchmarkEngineConfig ) from ..codegen import model_to_pytorch_script @@ -44,7 +43,7 @@ _logger = logging.getLogger(__name__) -__all__ = ['RetiariiExperiment', 'NasExperiment'] +__all__ = ['RetiariiExperiment'] def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dummy_input=None, oneshot=False): From 90971756c9a87a5ffd0a2c1f81121f66fe786c0f Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 21:48:31 +0800 Subject: [PATCH 14/77] minor --- nni/experiment/experiment.py | 2 ++ nni/retiarii/execution/base.py | 7 +++++++ nni/retiarii/execution/cgo_engine.py | 14 +++++++++----- nni/retiarii/experiment/config/engine_config.py | 2 -- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 3d06c7de34..feef6677a3 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -126,6 +126,8 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo The port of web UI. debug Whether to start in debug mode. 
+ run_mode + Running the experiment in foreground or background """ config = self._start_begin(debug, run_mode) diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index 869db9f1a5..c35d357ad0 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -62,6 +62,13 @@ def __init__(self, rest_port: int | None = None, rest_url_prefix: str | None = N Upon initialization, advisor callbacks need to be registered. Advisor will call the callbacks when the corresponding event has been triggered. Base execution engine will get those callbacks and broadcast them to graph listener. + + Parameters + ---------- + rest_port + The port of the experiment's rest server + rest_url_prefix + The url prefix of the experiment's rest entry """ self.port = rest_port self.url_prefix = rest_url_prefix diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index acf3fd3524..f2d149a1d8 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -42,16 +42,20 @@ class CGOExecutionEngine(AbstractExecutionEngine): Parameters ---------- - devices : List[Device] - Available devices for execution. - max_concurrency : int + training_service + The remote training service config. + max_concurrency The maximum number of trials to run concurrently. - batch_waiting_time: int + batch_waiting_time Seconds to wait for each batch of trial submission. The trials within one batch could apply cross-graph optimization. + rest_port + The port of the experiment's rest server + rest_url_prefix + The url prefix of the experiment's rest entry """ - def __init__(self, training_service, + def __init__(self, training_service: RemoteConfig, max_concurrency: int = None, batch_waiting_time: int = 60, rest_port: int | None = None, diff --git a/nni/retiarii/experiment/config/engine_config.py b/nni/retiarii/experiment/config/engine_config.py index 042e227012..2147147622 100644 --- a/nni/retiarii/experiment/config/engine_config.py +++ b/nni/retiarii/experiment/config/engine_config.py @@ -11,8 +11,6 @@ @dataclass(init=False) class ExecutionEngineConfig(ConfigBase): - """ - """ name: str @dataclass(init=False) From 879aa567f9ff315295e122b6bbcd8141a1cb1b78 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 10 May 2022 22:00:18 +0800 Subject: [PATCH 15/77] minor --- nni/retiarii/experiment/pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 6736ca2561..8e8243f780 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -18,6 +18,7 @@ import torch.nn as nn import nni.runtime.log from nni.experiment import Experiment, RunMode, launcher, management +from nni.experiment.config.training_services import RemoteConfig from nni.runtime.protocol import connect_websocket from .config import ( @@ -240,7 +241,7 @@ def _create_execution_engine(self) -> None: "CGO execution engine currently only supports remote training service" assert self.config.execution_engine.batch_waiting_time is not None \ and self.config.execution_engine.max_concurrency_cgo is not None - engine = CGOExecutionEngine(self.config.training_service, + engine = CGOExecutionEngine(cast(RemoteConfig, self.config.training_service), max_concurrency=self.config.execution_engine.max_concurrency_cgo, batch_waiting_time=self.config.execution_engine.batch_waiting_time, rest_port=self.port, From 1d723ad77167aa6a048f670aeca3c26f0df7a02c Mon Sep 17 00:00:00 2001 
From: quzha Date: Tue, 10 May 2022 22:27:48 +0800 Subject: [PATCH 16/77] update --- nni/retiarii/experiment/config/experiment_config.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 456b404aa1..b21a5b71df 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -39,14 +39,10 @@ def __init__(self, training_service_platform: str | None = None, **kwargs): super().__init__(training_service_platform, **kwargs) - if execution_engine is not None: - # the user chose to init with `config = ExperimentConfig('local')` and set fields later - # we need to create empty training service & algorithm configs to support `config.tuner.name = 'random'` - assert utils.is_missing(self.execution_engine) - if isinstance(execution_engine, str): - self.execution_engine = execution_engine_config_factory(execution_engine) - else: - self.execution_engine = execution_engine + if isinstance(execution_engine, str): + self.execution_engine = execution_engine_config_factory(execution_engine) + else: + self.execution_engine = execution_engine self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name From 3d9e10cea9465d82acd8b2b9b3edbac29bf584de Mon Sep 17 00:00:00 2001 From: quzha Date: Sun, 15 May 2022 17:20:33 +0800 Subject: [PATCH 17/77] resolve some comments --- .../experiment/config/experiment_config.py | 45 ++++++------ nni/retiarii/experiment/pytorch.py | 71 ++++++++++--------- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index b21a5b71df..72bc6c1125 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -8,7 +8,7 @@ from nni.experiment.config import utils, ExperimentConfig -from .engine_config import ExecutionEngineConfig, PyEngineConfig +from .engine_config import ExecutionEngineConfig __all__ = ['RetiariiExeConfig'] @@ -32,33 +32,30 @@ class RetiariiExeConfig(ExperimentConfig): trial_code_directory: utils.PathLike = '.' trial_command: str = '_reserved' # new config field for NAS - execution_engine: ExecutionEngineConfig = PyEngineConfig() + execution_engine: str | ExecutionEngineConfig def __init__(self, training_service_platform: str | None = None, - execution_engine: str | ExecutionEngineConfig = PyEngineConfig(), + execution_engine: str | ExecutionEngineConfig = 'py', **kwargs): super().__init__(training_service_platform, **kwargs) - - if isinstance(execution_engine, str): - self.execution_engine = execution_engine_config_factory(execution_engine) - else: - self.execution_engine = execution_engine - - self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name - - def __setattr__(self, key, value): - #TODO: tuner settings can also be blocked here - fixed_attrs = {'search_space': '', - 'trial_command': '_reserved'} - if key in fixed_attrs and fixed_attrs[key] != value: - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - # 'trial_code_directory' is handled differently because the path will be converted to absolute path by us - if key == 'trial_code_directory' and not (str(value) == '.' 
or os.path.isabs(value)): - raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!') - #if key == 'execution_engine': - # assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.' - # self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value - super().__setattr__(key, value) #TODO: double check whether new fields are validated + self.execution_engine = execution_engine + + def _canonicalize(self, _parents): + msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' + if self.search_space != '': + raise ValueError(msg.format('search_space', self.search_space)) + if str(self.trial_code_directory) != '.' and not os.path.isabs(self.trial_code_directory): + raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) + if self.trial_command != '_reserved' and \ + not self.trial_command.startswith('python3 -m nni.retiarii.trial_entry '): + raise ValueError(msg.format('trial_command', self.trial_command)) + + if isinstance(self.execution_engine, str): + self.execution_engine = execution_engine_config_factory(self.execution_engine) + if self.execution_engine.name in ('py', 'base', 'cgo'): + self.trial_command = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name + + super()._canonicalize([self]) def _validate_canonical(self): super()._validate_canonical(False) \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 8e8243f780..0dba05cf36 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -8,7 +8,7 @@ import warnings from subprocess import Popen from threading import Thread -from typing import Any, List, Optional, Union, cast +from typing import Any, List, Union, cast import colorama import psutil @@ -205,6 +205,9 @@ def __init__(self, base_model: nn.Module, self.applied_mutators = applied_mutators self.strategy = strategy + self._dispatcher = None + self._dispatcher_thread = None + # check for sanity if not is_model_wrapped(base_model): warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + @@ -212,12 +215,12 @@ def __init__(self, base_model: nn.Module, 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, RuntimeWarning) - def _run_strategy(self): + def _run_strategy(self, config: RetiariiExeConfig): base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, - full_ir=not isinstance(self.config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=self.config.execution_engine.dummy_input - if isinstance(self.config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None + full_ir=not isinstance(config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=config.execution_engine.dummy_input + if isinstance(config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) _logger.info('Start strategy...') @@ -226,39 +229,39 @@ def _run_strategy(self): self.strategy.run(base_model_ir, self.applied_mutators) _logger.info('Strategy exit') # TODO: find out a proper way to show no more trial message on WebUI - # self._dispatcher.mark_experiment_as_ending() - def _create_execution_engine(self) -> None: + def _create_execution_engine(self, config: RetiariiExeConfig) -> None: #TODO: we will probably need a execution engine factory to make this clean and elegant - if isinstance(self.config.execution_engine, BaseEngineConfig): + if isinstance(config.execution_engine, BaseEngineConfig): from ..execution.base import BaseExecutionEngine engine = BaseExecutionEngine(self.port, self.url_prefix) - elif isinstance(self.config.execution_engine, CgoEngineConfig): + elif isinstance(config.execution_engine, CgoEngineConfig): from ..execution.cgo_engine import CGOExecutionEngine - assert not isinstance(self.config.training_service, list) \ - and self.config.training_service.platform == 'remote', \ + assert not isinstance(config.training_service, list) \ + and config.training_service.platform == 'remote', \ "CGO execution engine currently only supports remote training service" - assert self.config.execution_engine.batch_waiting_time is not None \ - and self.config.execution_engine.max_concurrency_cgo is not None - engine = CGOExecutionEngine(cast(RemoteConfig, self.config.training_service), - max_concurrency=self.config.execution_engine.max_concurrency_cgo, - batch_waiting_time=self.config.execution_engine.batch_waiting_time, + assert config.execution_engine.batch_waiting_time is not None \ + and config.execution_engine.max_concurrency_cgo is not None + engine = CGOExecutionEngine(cast(RemoteConfig, config.training_service), + max_concurrency=config.execution_engine.max_concurrency_cgo, + batch_waiting_time=config.execution_engine.batch_waiting_time, rest_port=self.port, rest_url_prefix=self.url_prefix) - elif isinstance(self.config.execution_engine, PyEngineConfig): + elif isinstance(config.execution_engine, PyEngineConfig): from ..execution.python import PurePythonExecutionEngine engine = PurePythonExecutionEngine(self.port, self.url_prefix) - elif isinstance(self.config.execution_engine, BenchmarkEngineConfig): + elif isinstance(config.execution_engine, BenchmarkEngineConfig): from ..execution.benchmark import BenchmarkExecutionEngine - assert self.config.execution_engine.benchmark is not None, \ + assert config.execution_engine.benchmark is not None, \ '"benchmark" must be set when benchmark execution engine is used.' 
- engine = BenchmarkExecutionEngine(self.config.execution_engine.benchmark) + engine = BenchmarkExecutionEngine(config.execution_engine.benchmark) else: - raise ValueError(f'Unsupported engine type: {self.config.execution_engine}') + raise ValueError(f'Unsupported engine type: {config.execution_engine}') set_execution_engine(engine) - def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + def start(self, port: int = 8080, debug: bool = False, + run_mode: RunMode = RunMode.Background) -> RetiariiExeConfig: """ Start the experiment in background. This method will raise exception on failure. @@ -282,14 +285,12 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo self._start_end(port, config.nni_manager_ip) self._dispatcher = RetiariiAdvisor() - # dispatcher must be launched after pipe initialized - # the logic to launch dispatcher in background should be refactored into dispatcher api self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() + return config - self._create_execution_engine() # FIXME: engine cannot be created twice - - def run(self, config: Optional[RetiariiExeConfig] = None, + def run(self, + config: RetiariiExeConfig | None = None, port: int = 8080, debug: bool = False) -> None: """ @@ -308,20 +309,26 @@ def run(self, config: Optional[RetiariiExeConfig] = None, if config is None: self.config = RetiariiExeConfig() self.config.execution_engine = OneshotEngineConfig() + else: + self.config = config + + if isinstance(self.config.execution_engine, OneshotEngineConfig) \ + or (isinstance(self.config.execution_engine, str) and self.config.execution_engine == 'oneshot'): + # this is hacky, will be refactored when oneshot can run on training services base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) - # FIXME: oneshot strategy should also be executable on training services self.strategy.run(base_model_ir, self.applied_mutators) else: - self.config = config - self.start(port, debug) + config = self.start(port, debug) + # FIXME: engine cannot be created twice + self._create_execution_engine(config) try: - self._run_strategy() + self._run_strategy(config) # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: _logger.warning('KeyboardInterrupt detected') self.stop() - _logger.info('Search process is done, the experiment is still alive') + _logger.info('Search process is done, the experiment is still alive, `stop()` can terminate the experiment.') def stop(self) -> None: """ From a8c15ea7ed7ce942782d849ac46e088dbc68ceff Mon Sep 17 00:00:00 2001 From: quzha Date: Sun, 15 May 2022 18:05:28 +0800 Subject: [PATCH 18/77] resolve comments --- nni/retiarii/experiment/pytorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 0dba05cf36..c87cdcf1bf 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -307,6 +307,8 @@ def run(self, return if config is None: + warnings.warn('config = None is deprecate in future. 
If you are running a one-shot experiment, ' + 'please consider creating a config and set execution engine to `oneshot`.', DeprecationWarning) self.config = RetiariiExeConfig() self.config.execution_engine = OneshotEngineConfig() else: From 5f4b32c52f5c2b781d164af5e04c48857c77952a Mon Sep 17 00:00:00 2001 From: quzha Date: Sun, 15 May 2022 22:37:07 +0800 Subject: [PATCH 19/77] minor --- nni/retiarii/experiment/pytorch.py | 8 -------- nni/runtime/msg_dispatcher_base.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index ce8293391e..591897014b 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -224,8 +224,6 @@ def _run_strategy(self, config: RetiariiExeConfig): _logger.info('Start strategy...') search_space = dry_run_for_formatted_search_space(base_model_ir, self.applied_mutators) - #import time - #time.sleep(10) self.update_search_space(search_space) self.strategy.run(base_model_ir, self.applied_mutators) _logger.info('Strategy exit') @@ -284,12 +282,6 @@ def start(self, port: int = 8080, debug: bool = False, self._start_end(port, config.nni_manager_ip) - #from nni.experiment import rest - #from collections import OrderedDict - #import time - #time.sleep(10) - #rest.put(8090, '/experiment?update_type=SEARCH_SPACE', {'params': {'experimentName': 'mnist_search', 'searchSpace': OrderedDict([('model_1', {'_type': 'choice', '_value': ['0', '1']}), ('model_2', {'_type': 'choice', '_value': [0.25, 0.5, 0.75]}), ('model_3', {'_type': 'choice', '_value': [64, 128, 256]})]), 'trialCommand': 'python3 -m nni.retiarii.trial_entry py', 'trialCodeDirectory': '.', 'trialConcurrency': 2, 'maxTrialNumber': 4, 'useAnnotation': False, 'debug': False, 'logLevel': 'info', 'experimentWorkingDirectory': '/home/quzha/nni-experiments', 'trainingService': {'platform': 'local', 'trialCommand': 'python3 -m nni.retiarii.trial_entry py', 'trialCodeDirectory': '/home/quzha/nni/nni/examples/nas/multi-trial/mnist', 'debug': False, 'useActiveGpu': False, 'maxTrialNumberPerGpu': 1, 'reuseMode': False}, 'executionEngine': {'name': 'py'}}, 'id': 'mn7j1h0g', 'execDuration': 0, 'logDir': '/home/quzha/nni-experiments/mn7j1h0g', 'startTime': 1652616830914, 'nextSequenceId': 0, 'revision': 2}) - self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 5f57937db8..43c14b4af7 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -27,7 +27,7 @@ def __init__(self, command_channel_url=None): if command_channel_url is None: command_channel_url = dispatcher_env_vars.NNI_TUNER_COMMAND_CHANNEL self._channel = TunerCommandChannel(command_channel_url) - # NOTE: `connect()` should be put in __init__. First, this `connect()` affects nnimanager's + # NOTE: `connect()` should be put in __init__. First, this `connect()` affects nnimanager's # starting process, without `connect()` nnimanager is blocked in `dispatcher.init()`. # Second, nas experiment uses a thread to execute `run()` of this class, thus, there is # no way to know when the websocket between nnimanager and dispatcher is built. 
The following From 6743fa79ec74de5b54e5cf765e99cf25e5315637 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 08:30:08 +0800 Subject: [PATCH 20/77] pyright --- nni/retiarii/experiment/pytorch.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 591897014b..494a828b74 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -285,7 +285,7 @@ def start(self, port: int = 8080, debug: bool = False, self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() - return config + return cast(RetiariiExeConfig, config) def run(self, config: RetiariiExeConfig | None = None, @@ -388,26 +388,3 @@ def retrain_model(self, model): this function retrains the exported model, and test it to output test accuracy """ raise NotImplementedError - -""" -class NasExperiment(RetiariiExperiment): - - #This class is only a new interface wrapper. - - def __init__(self, model: nn.Module, - evaluator: Union[BaseOneShotTrainer, Evaluator], - strategy: BaseStrategy, - config_or_platform: ExperimentConfig | str | list[str] | None = 'local', - execution_engine: Union[str, ExecutionEngineConfig] = 'py', - mutators: List[Mutator] = cast(List[Mutator], None)): - ... - - def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool | None: - #Run the experiment. - #This function will block until experiment finish or error. - if isinstance(self.config.execution_engine.name, OneshotEngineConfig): - base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) - self.strategy.run(base_model_ir, self.applied_mutators) - else: - super().run(port, wait_completion, debug) -""" \ No newline at end of file From d1ea7f5a68de46f4ef64684f1dd9b50f2fd2368d Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 14:14:35 +0800 Subject: [PATCH 21/77] fix ut --- nni/runtime/msg_dispatcher_base.py | 13 +++++++++++-- test/ut/retiarii/test_cgo_engine.py | 6 +++--- test/ut/retiarii/test_engine.py | 4 ++-- test/ut/sdk/test_assessor.py | 2 +- test/ut/sdk/test_msg_dispatcher.py | 2 +- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 43c14b4af7..82d90d9549 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -18,8 +18,15 @@ class MsgDispatcherBase(Recoverable): - """This is where tuners and assessors are not defined yet. + """ + This is where tuners and assessors are not defined yet. Inherits this class to make your own advisor. + + .. note:: + + The class inheriting MsgDispatcherBase should be instantiated + after nnimanager (rest server) is started, so that the object + is ready to use right after its instantiation. """ def __init__(self, command_channel_url=None): @@ -34,7 +41,9 @@ def __init__(self, command_channel_url=None): # logic may crash is websocket is not built. One example is updating search space. If updating # search space too soon, as the websocket has not been built, the rest api of updating search # space will timeout. 
- self._channel.connect() + # FIXME: this is making unittest happy + if command_channel_url.startswith('ws://_unittest_'): + self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,)) diff --git a/test/ut/retiarii/test_cgo_engine.py b/test/ut/retiarii/test_cgo_engine.py index 67dde09380..8d67b26630 100644 --- a/test/ut/retiarii/test_cgo_engine.py +++ b/test/ut/retiarii/test_cgo_engine.py @@ -263,7 +263,7 @@ def test_dedup_input_four_devices(self): opt = DedupInputOptimizer() opt.convert(lp) - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -286,7 +286,7 @@ def test_dedup_input_two_devices(self): opt = DedupInputOptimizer() opt.convert(lp) - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -311,7 +311,7 @@ def test_submit_models(self): models = _load_mnist(2) - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() diff --git a/test/ut/retiarii/test_engine.py b/test/ut/retiarii/test_engine.py index 8e8f050c1a..c8cd760b8c 100644 --- a/test/ut/retiarii/test_engine.py +++ b/test/ut/retiarii/test_engine.py @@ -25,7 +25,7 @@ def test_codegen(self): def test_base_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -42,7 +42,7 @@ def test_base_execution_engine(self): def test_py_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None - advisor = RetiariiAdvisor('ws://_placeholder_') + advisor = RetiariiAdvisor('ws://_unittest_placeholder_') advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() diff --git a/test/ut/sdk/test_assessor.py b/test/ut/sdk/test_assessor.py index 0d5e078027..48c2c03324 100644 --- a/test/ut/sdk/test_assessor.py +++ b/test/ut/sdk/test_assessor.py @@ -57,7 +57,7 @@ def test_assessor(self): _restore_io() assessor = NaiveAssessor() - dispatcher = MsgDispatcher('ws://_placeholder_', None, assessor) + dispatcher = MsgDispatcher('ws://_unittest_placeholder_', None, assessor) dispatcher._channel = LegacyCommandChannel() msg_dispatcher_base._worker_fast_exit_on_terminate = False diff --git a/test/ut/sdk/test_msg_dispatcher.py b/test/ut/sdk/test_msg_dispatcher.py index 356308501c..643d4d9b7b 100644 --- a/test/ut/sdk/test_msg_dispatcher.py +++ b/test/ut/sdk/test_msg_dispatcher.py @@ -66,7 +66,7 @@ def test_msg_dispatcher(self): _restore_io() tuner = NaiveTuner() - dispatcher = MsgDispatcher('ws://_placeholder_', tuner) + dispatcher = MsgDispatcher('ws://_unittest_placeholder_', tuner) dispatcher._channel = LegacyCommandChannel() msg_dispatcher_base._worker_fast_exit_on_terminate = False From a3b55c2b2ca9e8d96f65a4db05c6b0ff38d2df4e Mon 
Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 15:57:51 +0800 Subject: [PATCH 22/77] minor --- nni/runtime/msg_dispatcher_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 82d90d9549..6873ff2409 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -42,7 +42,7 @@ def __init__(self, command_channel_url=None): # search space too soon, as the websocket has not been built, the rest api of updating search # space will timeout. # FIXME: this is making unittest happy - if command_channel_url.startswith('ws://_unittest_'): + if not command_channel_url.startswith('ws://_unittest_'): self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() From f895116015efd6c5454d013138f4511d05b171dd Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 18:06:16 +0800 Subject: [PATCH 23/77] fix cgo pipe --- test/ut/retiarii/test_cgo_engine.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/test/ut/retiarii/test_cgo_engine.py b/test/ut/retiarii/test_cgo_engine.py index 8d67b26630..5a3605eb21 100644 --- a/test/ut/retiarii/test_cgo_engine.py +++ b/test/ut/retiarii/test_cgo_engine.py @@ -9,6 +9,7 @@ from pathlib import Path import nni +from nni.experiment.config import RemoteConfig, RemoteMachineConfig import nni.runtime.platform.test from nni.runtime.tuner_command_channel import legacy as protocol import json @@ -268,8 +269,9 @@ def test_dedup_input_four_devices(self): advisor.default_worker.start() advisor.assessor_worker.start() - available_devices = [GPUDevice("test", 0), GPUDevice("test", 1), GPUDevice("test", 2), GPUDevice("test", 3)] - cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0) + remote = RemoteConfig(machine_list=[]) + remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1,2,3])) + cgo = CGOExecutionEngine(training_service=remote, batch_waiting_time=0) phy_models = cgo._assemble(lp) self.assertTrue(len(phy_models) == 1) @@ -291,8 +293,9 @@ def test_dedup_input_two_devices(self): advisor.default_worker.start() advisor.assessor_worker.start() - available_devices = [GPUDevice("test", 0), GPUDevice("test", 1)] - cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0) + remote = RemoteConfig(machine_list=[]) + remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1])) + cgo = CGOExecutionEngine(training_service=remote, batch_waiting_time=0) phy_models = cgo._assemble(lp) self.assertTrue(len(phy_models) == 2) @@ -316,8 +319,9 @@ def test_submit_models(self): advisor.default_worker.start() advisor.assessor_worker.start() - cgo_engine = CGOExecutionEngine(devices=[GPUDevice("test", 0), GPUDevice("test", 1), - GPUDevice("test", 2), GPUDevice("test", 3)], batch_waiting_time=0) + remote = RemoteConfig(machine_list=[]) + remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1,2,3])) + cgo_engine = CGOExecutionEngine(training_service=remote, batch_waiting_time=0) set_execution_engine(cgo_engine) submit_models(*models) time.sleep(3) From 33fd0b048b7cf8da92a47fa4baae3e7da0a47d0b Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 20:02:49 +0800 Subject: [PATCH 24/77] refactor --- nni/experiment/experiment.py | 25 +++++++------- nni/retiarii/experiment/pytorch.py | 52 +++++++----------------------- 2 files changed, 25 insertions(+), 52 deletions(-) diff --git 
a/nni/experiment/experiment.py b/nni/experiment/experiment.py index feef6677a3..b9a07a8a1a 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -87,7 +87,10 @@ def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None else: self.config = config_or_platform - def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: + def _start_impl(self, port: int, debug: bool, run_mode: RunMode, + url_prefix: str | None, + tuner_command_channel: str | None, + tags: list[str] = []) -> ExperimentConfig: assert self.config is not None if run_mode is not RunMode.Detach: atexit.register(self.stop) @@ -101,10 +104,14 @@ def _start_begin(self, debug: bool, run_mode: RunMode) -> ExperimentConfig: else: # this should never happen in latest version, keep it until v2.7 for potential compatibility log_dir = Path.home() / f'nni-experiments/{self.id}/log' nni.runtime.log.start_experiment_log(self.id, log_dir, debug) - return config - def _start_end(self, port: int, nni_manager_ip: Optional[str]) -> None: - ips = [nni_manager_ip] + self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, + url_prefix, tuner_command_channel, tags) + assert self._proc is not None + + self.port = port # port will be None if start up failed + + ips = [config.nni_manager_ip] for interfaces in psutil.net_if_addrs().values(): for interface in interfaces: if interface.family == socket.AF_INET: @@ -112,6 +119,7 @@ def _start_end(self, port: int, nni_manager_ip: Optional[str]) -> None: ips = [f'http://{ip}:{port}' for ip in ips if ip] msg = 'Web portal URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL _logger.info(msg) + return config def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: """ @@ -129,14 +137,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo run_mode Running the experiment in foreground or background """ - config = self._start_begin(debug, run_mode) - - self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, self.url_prefix) - assert self._proc is not None - - self.port = port # port will be None if start up failed - - self._start_end(port, config.nni_manager_ip) + self._start_impl(port, debug, run_mode, self.url_prefix, None, []) def _stop(self) -> None: atexit.unregister(self.stop) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 494a828b74..a8d9f38823 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -182,7 +182,8 @@ def __init__(self, base_model: nn.Module, applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): - nni.runtime.log.init_logger_for_command_line() + super().__init__(None) + self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) if trainer is not None: warnings.warn('Usage of `trainer` in RetiariiExperiment is deprecated and will be removed soon. 
' @@ -192,13 +193,6 @@ def __init__(self, base_model: nn.Module, if evaluator is None: raise ValueError('Evaluator should not be none.') - self.id: str = management.generate_experiment_id() - self.port: int | None = None - self._proc: Popen | psutil.Process | None = None - self._action: Literal['create', 'resume', 'view'] = 'create' - self.url_prefix: str | None = None - self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) - self.base_model = base_model self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator self.applied_mutators = applied_mutators @@ -259,33 +253,13 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: raise ValueError(f'Unsupported engine type: {config.execution_engine}') set_execution_engine(engine) - def start(self, port: int = 8080, debug: bool = False, - run_mode: RunMode = RunMode.Background) -> RetiariiExeConfig: + def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: """ - Start the experiment in background. - This method will raise exception on failure. - If it returns, the experiment should have been successfully started. - Parameters - ---------- - port - The port of web UI. - debug - Whether to start in debug mode. + By design, the only different between `start` and `run` is that `start` is asynchronous, + while `run` waits the experiment to complete. RetiariiExperiment always waits the experiment + to complete as strategy runs in foreground. """ - config = self._start_begin(debug, run_mode) - - ws_url = f'ws://localhost:{port}/tuner' - self._proc = launcher.start_experiment('create', self.id, config, port, debug, # type: ignore - RunMode.Background, None, ws_url, ['retiarii']) - assert self._proc is not None - self.port = port # port will be None if start up failed - - self._start_end(port, config.nni_manager_ip) - - self._dispatcher = RetiariiAdvisor(ws_url) - self._dispatcher_thread = Thread(target=self._dispatcher.run) - self._dispatcher_thread.start() - return cast(RetiariiExeConfig, config) + raise NotImplementedError('RetiariiExperiment is not supposed to provide `start` method') def run(self, config: RetiariiExeConfig | None = None, @@ -318,7 +292,11 @@ def run(self, base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True) self.strategy.run(base_model_ir, self.applied_mutators) else: - config = self.start(port, debug) + ws_url = f'ws://localhost:{port}/tuner' + config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + self._dispatcher = RetiariiAdvisor(ws_url) + self._dispatcher_thread = Thread(target=self._dispatcher.run) + self._dispatcher_thread.start() # FIXME: engine cannot be created twice self._create_execution_engine(config) try: @@ -382,9 +360,3 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for return [model_to_pytorch_script(model) for model in all_models[:top_k]] elif formatter == 'dict': return [get_mutation_dict(model) for model in all_models[:top_k]] - - def retrain_model(self, model): - """ - this function retrains the exported model, and test it to output test accuracy - """ - raise NotImplementedError From 7609983a689be87e0932ae030988b91ebe3afe0c Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 20:34:08 +0800 Subject: [PATCH 25/77] fix pylint --- nni/experiment/experiment.py | 2 +- nni/retiarii/experiment/pytorch.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git 
a/nni/experiment/experiment.py b/nni/experiment/experiment.py index b9a07a8a1a..194fc6e572 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -10,7 +10,7 @@ import socket from subprocess import Popen import time -from typing import Any, Optional +from typing import Any import colorama import psutil diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index a8d9f38823..2ddaefbe79 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -6,18 +6,14 @@ import logging import warnings -from subprocess import Popen from threading import Thread from typing import Any, List, Union, cast import colorama -import psutil -from typing_extensions import Literal import torch import torch.nn as nn -import nni.runtime.log -from nni.experiment import Experiment, RunMode, launcher, management +from nni.experiment import Experiment, RunMode from nni.experiment.config.training_services import RemoteConfig from .config import ( From c51a5200407106a09a578410d882e20cff40e875 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 20:38:09 +0800 Subject: [PATCH 26/77] minor --- nni/retiarii/execution/api.py | 8 +++++--- nni/retiarii/experiment/pytorch.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/nni/retiarii/execution/api.py b/nni/retiarii/execution/api.py index d0028e5e72..1d9c6bf5c5 100644 --- a/nni/retiarii/execution/api.py +++ b/nni/retiarii/execution/api.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. import time +import warnings from typing import Iterable from ..graph import Model, ModelStatus @@ -21,9 +22,10 @@ def set_execution_engine(engine: AbstractExecutionEngine) -> None: if _execution_engine is None: _execution_engine = engine else: - raise RuntimeError('Execution engine is already set. ' - 'You should avoid instantiating RetiariiExperiment twice in one process. ' - 'If you are running in a Jupyter notebook, please restart the kernel.') + warnings.warn('Execution engine is already set. ' + 'You should avoid instantiating RetiariiExperiment twice in one process. ' + 'If you are running in a Jupyter notebook, please restart the kernel.', + RuntimeWarning) def get_execution_engine() -> AbstractExecutionEngine: diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 2ddaefbe79..bb1cb9259d 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -338,7 +338,8 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``dict``, the mutation history will be returned. """ if formatter == 'code': - assert not isinstance(self.config.execution_engine, PyEngineConfig), \ + config = self.config.canonical_copy() + assert not isinstance(config.execution_engine, PyEngineConfig), \ 'You should use `dict` formatter when using Python execution engine.' if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' 
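For orientation between the patches above and below, the user-facing flow this series converges on is roughly the following sketch, adapted from the examples/nas/multi-trial/mnist/search.py script touched earlier in the series. It is an illustration, not part of any patch: model_space, evaluator and strategy are placeholders the user must supply, and the import line is the one that example assumes.

from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig

exp = RetiariiExperiment(model_space, evaluator, [], strategy)  # model_space/evaluator/strategy: user-supplied placeholders
exp_config = RetiariiExeConfig('local')                         # or RetiariiExeConfig('local', execution_engine='base')
exp_config.experiment_name = 'mnist_search'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.training_service.use_active_gpu = False

exp.run(exp_config, 8080)        # blocks until the search strategy finishes; the experiment stays alive afterwards
for model_code in exp.export_top_models(formatter='dict'):
    print(model_code)
exp.stop()                       # terminate the still-running experiment explicitly

The same flow works with the 'base' engine and formatter='code'; with the pure-Python engine, only the 'dict' formatter is supported, as the assertion above notes.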
From 7edef1abec6dbd9139ecc7dcc15ed3d371f785d4 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 16 May 2022 22:08:04 +0800 Subject: [PATCH 27/77] fix pyright --- nni/retiarii/experiment/pytorch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index bb1cb9259d..849fd02863 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -289,14 +289,15 @@ def run(self, self.strategy.run(base_model_ir, self.applied_mutators) else: ws_url = f'ws://localhost:{port}/tuner' - config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + canonicalized_config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + canonicalized_config = cast(RetiariiExeConfig, canonicalized_config) self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run) self._dispatcher_thread.start() # FIXME: engine cannot be created twice - self._create_execution_engine(config) + self._create_execution_engine(canonicalized_config) try: - self._run_strategy(config) + self._run_strategy(canonicalized_config) # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: From 644cc72211d8545b0bee491016ab14872d21b0c7 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 20 May 2022 12:20:30 +0800 Subject: [PATCH 28/77] resolve comments --- nni/experiment/config/experiment_config.py | 4 ++-- nni/experiment/experiment.py | 4 ++-- nni/retiarii/execution/api.py | 5 ++--- nni/retiarii/experiment/config/experiment_config.py | 3 --- nni/retiarii/experiment/pytorch.py | 5 ++--- nni/retiarii/integration_api.py | 5 ++++- 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index e8791c0bf7..64af5e3af1 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -141,7 +141,7 @@ def _canonicalize(self, _parents): msg = f'nni_manager_ip is not set, please make sure {ip} is accessible from training machines' logging.getLogger('nni.experiment.config').warning(msg) - def _validate_canonical(self, validate_tuner: bool = True): # FIXME: remove validate_tuner + def _validate_canonical(self): super()._validate_canonical() space_cnt = (self.search_space is not None) + (self.search_space_file is not None) @@ -164,7 +164,7 @@ def _validate_canonical(self, validate_tuner: bool = True): # FIXME: remove vali # currently I have only seen one issue of this kind #Path(self.experiment_working_directory).mkdir(parents=True, exist_ok=True) - if validate_tuner: + if type(self).__name__ != 'RetiariiExeConfig': utils.validate_gpu_indices(self.tuner_gpu_indices) if self.tuner is None: diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 194fc6e572..3438221b4b 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -139,7 +139,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo """ self._start_impl(port, debug, run_mode, self.url_prefix, None, []) - def _stop(self) -> None: + def _stop_impl(self) -> None: atexit.unregister(self.stop) nni.runtime.log.stop_experiment_log(self.id) @@ -160,7 +160,7 @@ def stop(self) -> None: Stop the experiment. 
""" _logger.info('Stopping experiment, please wait...') - self._stop() + self._stop_impl() _logger.info('Experiment stopped') def _wait_completion(self) -> bool: diff --git a/nni/retiarii/execution/api.py b/nni/retiarii/execution/api.py index 1d9c6bf5c5..01c85f81ee 100644 --- a/nni/retiarii/execution/api.py +++ b/nni/retiarii/execution/api.py @@ -19,13 +19,12 @@ def set_execution_engine(engine: AbstractExecutionEngine) -> None: global _execution_engine - if _execution_engine is None: - _execution_engine = engine - else: + if _execution_engine is not None: warnings.warn('Execution engine is already set. ' 'You should avoid instantiating RetiariiExperiment twice in one process. ' 'If you are running in a Jupyter notebook, please restart the kernel.', RuntimeWarning) + _execution_engine = engine def get_execution_engine() -> AbstractExecutionEngine: diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 72bc6c1125..945bf3704e 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -56,6 +56,3 @@ def _canonicalize(self, _parents): self.trial_command = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name super()._canonicalize([self]) - - def _validate_canonical(self): - super()._validate_canonical(False) \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 849fd02863..6f4aeef3c3 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -310,7 +310,7 @@ def stop(self) -> None: Stop background experiment. """ _logger.info('Stopping experiment, please wait...') - self._stop() + self._stop_impl() if self._dispatcher_thread: self._dispatcher_thread.join() self._dispatcher = cast(RetiariiAdvisor, None) @@ -319,8 +319,6 @@ def stop(self) -> None: def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', formatter: str = 'dict') -> Any: """ - TODO: the base class may also need this method - Export several top performing models. For one-shot algorithms, only top-1 is supported. For others, ``optimize_mode`` and ``formatter`` are @@ -338,6 +336,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for If ``code``, the python code of model will be returned. If ``dict``, the mutation history will be returned. """ + # TODO: the base class may also need this method if formatter == 'code': config = self.config.canonical_copy() assert not isinstance(config.execution_engine, PyEngineConfig), \ diff --git a/nni/retiarii/integration_api.py b/nni/retiarii/integration_api.py index dfc77bdc2b..643758ec2a 100644 --- a/nni/retiarii/integration_api.py +++ b/nni/retiarii/integration_api.py @@ -22,7 +22,10 @@ def get_advisor() -> 'RetiariiAdvisor': def register_advisor(advisor: 'RetiariiAdvisor'): global _advisor - assert _advisor is None + if _advisor is not None: + warnings.warn('Advisor is already set.' + 'You should avoid instantiating RetiariiExperiment twice in one proces.' 
+ 'If you are running in a Jupyter notebook, please restart the kernel.') _advisor = advisor From d610d43d300a3485f628f6e004a7bdfee3e7840d Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 09:25:48 +0800 Subject: [PATCH 29/77] resolve all the comments --- examples/nas/multi-trial/mnist/search.py | 5 ++--- nni/experiment/experiment.py | 5 ++--- nni/retiarii/experiment/pytorch.py | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/nas/multi-trial/mnist/search.py b/examples/nas/multi-trial/mnist/search.py index 6ee65a70bb..52d1007493 100644 --- a/examples/nas/multi-trial/mnist/search.py +++ b/examples/nas/multi-trial/mnist/search.py @@ -131,7 +131,7 @@ def evaluate_model(model_cls): exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_search' exp_config.trial_concurrency = 2 - exp_config.max_trial_number = 4 + exp_config.max_trial_number = 20 exp_config.training_service.use_active_gpu = False export_formatter = 'dict' @@ -139,8 +139,7 @@ def evaluate_model(model_cls): # exp_config.execution_engine = 'base' # export_formatter = 'code' - exp.run(exp_config, 8090) + exp.run(exp_config, 8080) print('Final model:') for model_code in exp.export_top_models(formatter=export_formatter): print(model_code) - exp.stop() \ No newline at end of file diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 3438221b4b..a5608e8dd8 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -88,7 +88,6 @@ def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None self.config = config_or_platform def _start_impl(self, port: int, debug: bool, run_mode: RunMode, - url_prefix: str | None, tuner_command_channel: str | None, tags: list[str] = []) -> ExperimentConfig: assert self.config is not None @@ -106,7 +105,7 @@ def _start_impl(self, port: int, debug: bool, run_mode: RunMode, nni.runtime.log.start_experiment_log(self.id, log_dir, debug) self._proc = launcher.start_experiment(self._action, self.id, config, port, debug, run_mode, - url_prefix, tuner_command_channel, tags) + self.url_prefix, tuner_command_channel, tags) assert self._proc is not None self.port = port # port will be None if start up failed @@ -137,7 +136,7 @@ def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMo run_mode Running the experiment in foreground or background """ - self._start_impl(port, debug, run_mode, self.url_prefix, None, []) + self._start_impl(port, debug, run_mode, None, []) def _stop_impl(self) -> None: atexit.unregister(self.stop) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 6f4aeef3c3..2f81d781b5 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -249,7 +249,7 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: raise ValueError(f'Unsupported engine type: {config.execution_engine}') set_execution_engine(engine) - def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None: + def start(self, *args, **kwargs) -> None: """ By design, the only different between `start` and `run` is that `start` is asynchronous, while `run` waits the experiment to complete. 
RetiariiExperiment always waits the experiment @@ -289,10 +289,10 @@ def run(self, self.strategy.run(base_model_ir, self.applied_mutators) else: ws_url = f'ws://localhost:{port}/tuner' - canonicalized_config = self._start_impl(port, debug, RunMode.Background, None, ws_url, ['retiarii']) + canonicalized_config = self._start_impl(port, debug, RunMode.Background, ws_url, ['retiarii']) canonicalized_config = cast(RetiariiExeConfig, canonicalized_config) self._dispatcher = RetiariiAdvisor(ws_url) - self._dispatcher_thread = Thread(target=self._dispatcher.run) + self._dispatcher_thread = Thread(target=self._dispatcher.run, daemon=True) self._dispatcher_thread.start() # FIXME: engine cannot be created twice self._create_execution_engine(canonicalized_config) From 6e9ca3533719b3d597c064256bc9b7290fddcf7a Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 12:29:09 +0800 Subject: [PATCH 30/77] add comment --- ts/nni_manager/core/nnimanager.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 54a42760cc..8bce93e860 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -303,6 +303,9 @@ class NNIManager implements Manager { } this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener); + // NOTE: this sending TERMINATE should be out of the if clause, + // because when python dispatcher is started before nnimanager + // this.dispatcherPid would not have a valid value (i.e., not >0). this.dispatcher.sendCommand(TERMINATE); if (this.dispatcherPid > 0) { // gracefully terminate tuner and assessor here, wait at most 30 seconds. From 3e4a84a21dd1f9133faffd2336f3a53c980f9d0c Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 21:20:50 +0800 Subject: [PATCH 31/77] fix bug --- nni/experiment/config/base.py | 5 +++++ nni/experiment/launcher.py | 1 + nni/retiarii/execution/base.py | 1 + nni/retiarii/execution/cgo_engine.py | 1 + nni/retiarii/experiment/config/experiment_config.py | 12 +++++++----- test/retiarii_test/cgo_mnasnet/base_mnasnet.py | 1 - test/retiarii_test/cgo_mnasnet/test.py | 8 +++----- 7 files changed, 18 insertions(+), 11 deletions(-) diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index f3d44e063f..5fb064800a 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -54,6 +54,11 @@ class ExperimentConfig(ConfigBase): Config objects will remember where they are loaded; therefore relative paths can be resolved smartly. If a config object is created with constructor, the base path will be current working directory. If it is loaded with ``ConfigBase.load(path)``, the base path will be ``path``'s parent. + + .. attention:: + + All the classes that inherit ``ConfigBase`` are not allowed to use ``from __future__ import annotations``, + because ``ConfigBase`` uses ``typeguard`` to perform runtime check and it does not support lazy annotations. 
""" def __init__(self, **kwargs): diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index eac6796e0e..dba5b026e5 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -137,6 +137,7 @@ def start_experiment( ) _logger.info('Setting up...') + print('zql: ', config.json()) rest.post(port, '/experiment', config.json(), url_prefix) except Exception as e: diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index c35d357ad0..a45299065d 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. + from __future__ import annotations import logging diff --git a/nni/retiarii/execution/cgo_engine.py b/nni/retiarii/execution/cgo_engine.py index f2d149a1d8..b959c54a4f 100644 --- a/nni/retiarii/execution/cgo_engine.py +++ b/nni/retiarii/execution/cgo_engine.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. + from __future__ import annotations import logging diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 945bf3704e..18869e90f0 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -1,10 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from __future__ import annotations import os from dataclasses import dataclass -from typing import Any +from typing import Any, Union from nni.experiment.config import utils, ExperimentConfig @@ -32,10 +31,10 @@ class RetiariiExeConfig(ExperimentConfig): trial_code_directory: utils.PathLike = '.' trial_command: str = '_reserved' # new config field for NAS - execution_engine: str | ExecutionEngineConfig + execution_engine: Union[str, ExecutionEngineConfig] - def __init__(self, training_service_platform: str | None = None, - execution_engine: str | ExecutionEngineConfig = 'py', + def __init__(self, training_service_platform: Union[str, None] = None, + execution_engine: Union[str, ExecutionEngineConfig] = 'py', **kwargs): super().__init__(training_service_platform, **kwargs) self.execution_engine = execution_engine @@ -44,6 +43,7 @@ def _canonicalize(self, _parents): msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' if self.search_space != '': raise ValueError(msg.format('search_space', self.search_space)) + # TODO: maybe we should also allow users to specify trial_code_directory if str(self.trial_code_directory) != '.' 
and not os.path.isabs(self.trial_code_directory): raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) if self.trial_command != '_reserved' and \ @@ -53,6 +53,8 @@ def _canonicalize(self, _parents): if isinstance(self.execution_engine, str): self.execution_engine = execution_engine_config_factory(self.execution_engine) if self.execution_engine.name in ('py', 'base', 'cgo'): + # TODO: replace python3 with more elegant approach + # maybe use sys.executable rendered in trial side (e.g., trial_runner) self.trial_command = 'python3 -m nni.retiarii.trial_entry ' + self.execution_engine.name super()._canonicalize([self]) diff --git a/test/retiarii_test/cgo_mnasnet/base_mnasnet.py b/test/retiarii_test/cgo_mnasnet/base_mnasnet.py index 3e76d0bf7c..3cbb7f6c04 100644 --- a/test/retiarii_test/cgo_mnasnet/base_mnasnet.py +++ b/test/retiarii_test/cgo_mnasnet/base_mnasnet.py @@ -4,7 +4,6 @@ import torch import torch.nn as torch_nn -from torchvision.models.utils import load_state_dict_from_url import torch.nn.functional as F import sys diff --git a/test/retiarii_test/cgo_mnasnet/test.py b/test/retiarii_test/cgo_mnasnet/test.py index eac4956f3f..651591d514 100644 --- a/test/retiarii_test/cgo_mnasnet/test.py +++ b/test/retiarii_test/cgo_mnasnet/test.py @@ -8,7 +8,7 @@ from nni.retiarii import serialize from base_mnasnet import MNASNet from nni.experiment import RemoteMachineConfig -from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig +from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig, CgoEngineConfig from nni.retiarii.strategy import TPEStrategy from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -59,8 +59,6 @@ exp_config.max_trial_number = 10 exp_config.trial_gpu_number = 1 exp_config.training_service.reuse_mode = True - exp_config.max_concurrency_cgo = 3 - exp_config.batch_waiting_time = 0 rm_conf = RemoteMachineConfig() rm_conf.host = '127.0.0.1' @@ -73,6 +71,6 @@ rm_conf.max_trial_number_per_gpu = 3 exp_config.training_service.machine_list = [rm_conf] - exp_config.execution_engine = 'cgo' + exp_config.execution_engine = CgoEngineConfig(max_concurrency_cgo = 3, batch_waiting_time = 0) - exp.run(exp_config, 8099) \ No newline at end of file + exp.run(exp_config, 8099) From b6876eb7431b7593b56c992538e58e122795be90 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 21:23:16 +0800 Subject: [PATCH 32/77] remove print --- nni/experiment/launcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index dba5b026e5..eac6796e0e 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -137,7 +137,6 @@ def start_experiment( ) _logger.info('Setting up...') - print('zql: ', config.json()) rest.post(port, '/experiment', config.json(), url_prefix) except Exception as e: From 10553997a8d75d1c49d30b81e0efe33a1954c731 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 23 May 2022 21:39:24 +0800 Subject: [PATCH 33/77] remove trailing whitespace --- nni/experiment/config/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/experiment/config/base.py b/nni/experiment/config/base.py index 5fb064800a..ab8b6f0619 100644 --- a/nni/experiment/config/base.py +++ b/nni/experiment/config/base.py @@ -54,7 +54,7 @@ class ExperimentConfig(ConfigBase): Config objects will remember where they are loaded; therefore relative paths can be resolved smartly. 
If a config object is created with constructor, the base path will be current working directory. If it is loaded with ``ConfigBase.load(path)``, the base path will be ``path``'s parent. - + .. attention:: All the classes that inherit ``ConfigBase`` are not allowed to use ``from __future__ import annotations``, From e0be690fc298e17c1d94d587e973dd641b20c858 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 7 Jun 2022 11:24:51 +0800 Subject: [PATCH 34/77] fix not exist issue --- nni/runtime/msg_dispatcher_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 6873ff2409..e7717874eb 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -46,8 +46,8 @@ def __init__(self, command_channel_url=None): self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() - self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,)) - self.assessor_worker = threading.Thread(target=self.command_queue_worker, args=(self.assessor_command_queue,)) + self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,), daemon=True) + self.assessor_worker = threading.Thread(target=self.command_queue_worker, args=(self.assessor_command_queue,), daemon=True) self.worker_exceptions = [] def run(self): From 811e44ec1e0f79733b47a509e009df6b7c6c3d77 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 8 Jun 2022 15:04:21 +0800 Subject: [PATCH 35/77] add unittest --- test/ut/retiarii/test_multitrial.py | 44 +++++++++++++++++++++++++++++ test/ut/retiarii/test_oneshot.py | 13 +++++---- 2 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 test/ut/retiarii/test_multitrial.py diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py new file mode 100644 index 0000000000..2f4259bad8 --- /dev/null +++ b/test/ut/retiarii/test_multitrial.py @@ -0,0 +1,44 @@ +import argparse +import torch.nn.functional as F + +from nni.retiarii import strategy +from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment +from .test_oneshot import _mnist_net + + +def test_multi_trial(): + evaluator_kwargs = { + 'max_epochs': 1 + } + + to_test = [ + # (model, evaluator) + _mnist_net('simple', evaluator_kwargs), + _mnist_net('simple_value_choice', evaluator_kwargs), + _mnist_net('value_choice', evaluator_kwargs), + _mnist_net('repeat', evaluator_kwargs), + _mnist_net('custom_op', evaluator_kwargs), + ] + + for base_model, evaluator in to_test: + search_strategy = strategy.Random() + exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) + exp_config = RetiariiExeConfig('local') + exp_config.experiment_name = 'mnist_unittest' + exp_config.trial_concurrency = 2 + exp_config.max_trial_number = 2 + exp_config.training_service.use_active_gpu = False + exp.run(exp_config, 8081) + assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--exp', type=str, default='all', metavar='E', + help='experiment to run, default = all') + args = parser.parse_args() + + if args.exp == 'all': + test_multi_trial() + else: + globals()[f'test_{args.exp}']() diff --git a/test/ut/retiarii/test_oneshot.py b/test/ut/retiarii/test_oneshot.py index 68afb7204c..5b84b61773 100644 --- a/test/ut/retiarii/test_oneshot.py +++ 
b/test/ut/retiarii/test_oneshot.py @@ -7,6 +7,7 @@ from torchvision.datasets import MNIST from torch.utils.data import Dataset, RandomSampler +import nni import nni.retiarii.nn.pytorch as nn from nni.retiarii import strategy, model_wrapper, basic_unit from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment @@ -216,13 +217,13 @@ def _mnist_net(type_, evaluator_kwargs): raise ValueError(f'Unsupported type: {type_}') transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - train_dataset = MNIST('data/mnist', train=True, download=True, transform=transform) + train_dataset = nni.trace(MNIST)('data/mnist', train=True, download=True, transform=transform) # Multi-GPU combined dataloader will break this subset sampler. Expected though. - train_random_sampler = RandomSampler(train_dataset, True, int(len(train_dataset) / 20)) - train_loader = DataLoader(train_dataset, 64, sampler=train_random_sampler) - valid_dataset = MNIST('data/mnist', train=False, download=True, transform=transform) - valid_random_sampler = RandomSampler(valid_dataset, True, int(len(valid_dataset) / 20)) - valid_loader = DataLoader(valid_dataset, 64, sampler=valid_random_sampler) + train_random_sampler = nni.trace(RandomSampler)(train_dataset, True, int(len(train_dataset) / 20)) + train_loader = nni.trace(DataLoader)(train_dataset, 64, sampler=train_random_sampler) + valid_dataset = nni.trace(MNIST)('data/mnist', train=False, download=True, transform=transform) + valid_random_sampler = nni.trace(RandomSampler)(valid_dataset, True, int(len(valid_dataset) / 20)) + valid_loader = nni.trace(DataLoader)(valid_dataset, 64, sampler=valid_random_sampler) evaluator = Classification(train_dataloader=train_loader, val_dataloaders=valid_loader, **evaluator_kwargs) return base_model, evaluator From bc849a1446dbb0604b1c80fd079a1a7c880d7040 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 8 Jun 2022 17:14:41 +0800 Subject: [PATCH 36/77] add one more test --- test/ut/retiarii/test_multitrial.py | 31 +++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 2f4259bad8..689284858a 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -1,5 +1,6 @@ import argparse -import torch.nn.functional as F +import os +from subprocess import Popen from nni.retiarii import strategy from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment @@ -28,10 +29,35 @@ def test_multi_trial(): exp_config.trial_concurrency = 2 exp_config.max_trial_number = 2 exp_config.training_service.use_active_gpu = False - exp.run(exp_config, 8081) + exp.run(exp_config, 8080) assert isinstance(exp.export_top_models()[0], dict) exp.stop() +python_script = """ +from nni.retiarii import strategy +from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment +from test_oneshot import _mnist_net + +base_model, evaluator = _mnist_net('simple', {'max_epochs': 1}) +search_strategy = strategy.Random() +exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) +exp_config = RetiariiExeConfig('local') +exp_config.experiment_name = 'mnist_unittest' +exp_config.trial_concurrency = 2 +exp_config.max_trial_number = 2 +exp_config.training_service.use_active_gpu = False +exp.run(exp_config, 8080) +assert isinstance(exp.export_top_models()[0], dict) +""" + +def test_exp_exit_without_stop(): + script_name = 
'tmp_multi_trial.py' + with open(script_name, 'w') as f: + f.write(python_script) + proc = Popen(['python3', script_name]) + proc.wait() + os.remove(script_name) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--exp', type=str, default='all', metavar='E', @@ -40,5 +66,6 @@ def test_multi_trial(): if args.exp == 'all': test_multi_trial() + test_exp_exit_without_stop() else: globals()[f'test_{args.exp}']() From 77ae20b50b926a6a9ddf5b562987ef8fead5adb2 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 8 Jun 2022 17:54:40 +0800 Subject: [PATCH 37/77] resolve comments --- nni/runtime/msg_dispatcher_base.py | 2 ++ test/ut/retiarii/test_multitrial.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index e7717874eb..99e6c71c91 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -46,6 +46,8 @@ def __init__(self, command_channel_url=None): self._channel.connect() self.default_command_queue = Queue() self.assessor_command_queue = Queue() + # here daemon should be True, because their parent thread is configured as daemon to enable smooth exit of NAS experiment. + # if daemon is not set, these threads will block the daemon effect of their parent thread. self.default_worker = threading.Thread(target=self.command_queue_worker, args=(self.default_command_queue,), daemon=True) self.assessor_worker = threading.Thread(target=self.command_queue_worker, args=(self.assessor_command_queue,), daemon=True) self.worker_exceptions = [] diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 689284858a..a309fa2292 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -1,5 +1,6 @@ import argparse import os +import sys from subprocess import Popen from nni.retiarii import strategy @@ -54,7 +55,7 @@ def test_exp_exit_without_stop(): script_name = 'tmp_multi_trial.py' with open(script_name, 'w') as f: f.write(python_script) - proc = Popen(['python3', script_name]) + proc = Popen([sys.executable, script_name]) proc.wait() os.remove(script_name) From b664a0af9928b804701d807a84421bdf5ced730f Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 9 Jun 2022 15:08:00 +0800 Subject: [PATCH 38/77] update --- test/ut/retiarii/test_multitrial.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index a309fa2292..467b6a6fdb 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -27,10 +27,11 @@ def test_multi_trial(): exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_unittest' - exp_config.trial_concurrency = 2 - exp_config.max_trial_number = 2 + exp_config.trial_concurrency = 1 + exp_config.max_trial_number = 1 exp_config.training_service.use_active_gpu = False exp.run(exp_config, 8080) + print(exp.export_top_models()) assert isinstance(exp.export_top_models()[0], dict) exp.stop() @@ -44,8 +45,8 @@ def test_multi_trial(): exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_unittest' -exp_config.trial_concurrency = 2 -exp_config.max_trial_number = 2 +exp_config.trial_concurrency = 1 +exp_config.max_trial_number = 1 exp_config.training_service.use_active_gpu = False 
exp.run(exp_config, 8080) assert isinstance(exp.export_top_models()[0], dict) From ecf87c32904b07993a822aa2ac74d0bd9c3f5ce6 Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 9 Jun 2022 15:54:20 +0800 Subject: [PATCH 39/77] fix pipeline --- test/ut/retiarii/test_multitrial.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 467b6a6fdb..2b604f179b 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -1,12 +1,15 @@ import argparse import os import sys +import pytorch_lightning as pl +import pytest from subprocess import Popen from nni.retiarii import strategy from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment from .test_oneshot import _mnist_net +pytestmark = pytest.mark.skipif(pl.__version__ < '1.0', reason='Incompatible APIs') def test_multi_trial(): evaluator_kwargs = { @@ -31,7 +34,6 @@ def test_multi_trial(): exp_config.max_trial_number = 1 exp_config.training_service.use_active_gpu = False exp.run(exp_config, 8080) - print(exp.export_top_models()) assert isinstance(exp.export_top_models()[0], dict) exp.stop() From 49fa868d15b17e5ad1d611aeecda30dc3e5a7da2 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 10 Jun 2022 09:45:40 +0800 Subject: [PATCH 40/77] add timeout for one test --- test/ut/retiarii/test_multitrial.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/ut/retiarii/test_multitrial.py b/test/ut/retiarii/test_multitrial.py index 2b604f179b..04812368da 100644 --- a/test/ut/retiarii/test_multitrial.py +++ b/test/ut/retiarii/test_multitrial.py @@ -54,6 +54,7 @@ def test_multi_trial(): assert isinstance(exp.export_top_models()[0], dict) """ +@pytest.mark.timeout(600) def test_exp_exit_without_stop(): script_name = 'tmp_multi_trial.py' with open(script_name, 'w') as f: From fc99b433e72eaf73fe20d8741d0672d9e57a6e38 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 15 Jun 2022 15:42:11 +0800 Subject: [PATCH 41/77] release note --- docs/source/release.rst | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/source/release.rst b/docs/source/release.rst index 1d7cb53956..b499df3552 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,6 +5,63 @@ Change Log ========== +Release 2.8 - 6/16/2022 +----------------------- + +Neural Architecture Search +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy +* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces +* Support AutoFormer search space in search space hub, thanks our collaborators xxx, xxx +* Support multi-GPU training of one-shot NAS +* One-shot NAS supports the NAS API `repeat` and `cell` +* Refactor of RetiariiExperiment to share the common implementation with HPO experiment +* CGO supports pytorch-lightning 1.6 + +Model Compression +^^^^^^^^^^^^^^^^^ + +* *Preview* Refactor and improvement of automatic model compress with a new `CompressionExperiment` +* Support customizating module replacement function for unsupported modules in model speedup +* Support the module replacement function for some user mentioned modules +* Support output_padding for contransposed2d in model speedup, thanks external contributor xxx + +Hyper-Parameter Optimization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Make config.tuner.name case insensitive +* 
Allow writing configurations of advisor in tuner format, i.e., aligning the configuration of advisor and tuner + +Experiment +^^^^^^^^^^ + +* Support launching multiple HPO experiments in one process +* Refactor of the logging mechanism in NNI +* Refactor of NNI manager globals for flexible and high externsibility +* Migrate dispatcher IPC to WebSocket +* Decouple lock stuffs from experiments manager logic +* Use launcher's sys.executable to detect Python interpreter + +WebUI +^^^^^ + +* Improve user experience of trial ordering in the overview page +* Fix the update issue in the trial detail page + +Documentation +^^^^^^^^^^^^^ + +* A new translation framework for document + +Notable Bugfixes +^^^^^^^^^^^^^^^^ + +* Fix TPE import issue for old metrics +* Fix the issue in TPE nested search space +* Support `RecursiveScriptModule` in speedup +* Fix the issue of failed "implicit type cast" in merge_parameter() + Release 2.7 - 4/18/2022 ----------------------- From a94cdf9a769b7803f6a0e7711fe3a8beb8b52fb6 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 21 Jun 2022 18:56:26 +0800 Subject: [PATCH 42/77] resolve comments --- README.md | 2 +- docs/source/conf.py | 2 +- docs/source/release.rst | 29 ++++++++++++++++------------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index f5009384e2..53fb2d5ccf 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ NNI automates feature engineering, neural architecture search, hyperparameter tu ## What's NEW!   -* **New release**: [v2.7 is available](https://github.com/microsoft/nni/releases/tag/v2.7) - _released on Apr-18-2022_ +* **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on Apr-18-2022_ * **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on Apr-18-2022_ * **New webinar**: [Introducing Retiarii: A deep learning exploratory-training framework on NNI](https://note.microsoft.com/MSR-Webinar-Retiarii-Registration-Live.html) - _scheduled on June-24-2021_ * **Newly upgraded documentation**: [Doc upgraded](https://nni.readthedocs.io/en/stable) diff --git a/docs/source/conf.py b/docs/source/conf.py index 882e70b01f..3b996be064 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -31,7 +31,7 @@ version = '' # The full version, including alpha/beta/rc tags # FIXME: this should be written somewhere globally -release = 'v2.7' +release = 'v2.8' # -- General configuration --------------------------------------------------- diff --git a/docs/source/release.rst b/docs/source/release.rst index b499df3552..aade4a97c0 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,43 +5,45 @@ Change Log ========== -Release 2.8 - 6/16/2022 +Release 2.8 - 6/21/2022 ----------------------- Neural Architecture Search ^^^^^^^^^^^^^^^^^^^^^^^^^^ * Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy -* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces -* Support AutoFormer search space in search space hub, thanks our collaborators xxx, xxx * Support multi-GPU training of one-shot NAS -* One-shot NAS supports the NAS API `repeat` and `cell` +* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces +* Support AutoFormer search space 
in search space hub, thanks our collaborators @nbl97 and @penghouwen +* One-shot NAS supports the NAS API ``repeat`` and ``cell`` * Refactor of RetiariiExperiment to share the common implementation with HPO experiment * CGO supports pytorch-lightning 1.6 Model Compression ^^^^^^^^^^^^^^^^^ -* *Preview* Refactor and improvement of automatic model compress with a new `CompressionExperiment` +* *Preview* Refactor and improvement of automatic model compress with a new ``CompressionExperiment`` * Support customizating module replacement function for unsupported modules in model speedup * Support the module replacement function for some user mentioned modules -* Support output_padding for contransposed2d in model speedup, thanks external contributor xxx +* Support output_padding for contransposed2d in model speedup, thanks external contributor @haoshuai-orka Hyper-Parameter Optimization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Make config.tuner.name case insensitive +* Make ``config.tuner.name`` case insensitive * Allow writing configurations of advisor in tuner format, i.e., aligning the configuration of advisor and tuner Experiment ^^^^^^^^^^ * Support launching multiple HPO experiments in one process -* Refactor of the logging mechanism in NNI -* Refactor of NNI manager globals for flexible and high externsibility -* Migrate dispatcher IPC to WebSocket -* Decouple lock stuffs from experiments manager logic -* Use launcher's sys.executable to detect Python interpreter +* Internal refactors and improvements + + * Refactor of the logging mechanism in NNI + * Refactor of NNI manager globals for flexible and high extensibility + * Migrate dispatcher IPC to WebSocket + * Decouple lock stuffs from experiments manager logic + * Use launcher's sys.executable to detect Python interpreter WebUI ^^^^^ @@ -53,13 +55,14 @@ Documentation ^^^^^^^^^^^^^ * A new translation framework for document +* Add a new quantization demo (`doc <>`__) Notable Bugfixes ^^^^^^^^^^^^^^^^ * Fix TPE import issue for old metrics * Fix the issue in TPE nested search space -* Support `RecursiveScriptModule` in speedup +* Support ``RecursiveScriptModule`` in speedup * Fix the issue of failed "implicit type cast" in merge_parameter() Release 2.7 - 4/18/2022 From 57a03f7cccefdd640973676dd6912726449a337a Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 21 Jun 2022 19:16:53 +0800 Subject: [PATCH 43/77] add doc links --- docs/source/release.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/release.rst b/docs/source/release.rst index aade4a97c0..9eba337d5f 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -11,9 +11,9 @@ Release 2.8 - 6/21/2022 Neural Architecture Search ^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy +* Align user experience of one-shot NAS with multi-trial NAS, i.e., users can use one-shot NAS by specifying the corresponding strategy (`doc `__) * Support multi-GPU training of one-shot NAS -* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces +* *Preview* Support load/retrain the pre-searched model of some search spaces, i.e., 18 models in 4 different search spaces (`doc `__) * Support AutoFormer search space in search space hub, thanks our collaborators @nbl97 and @penghouwen * One-shot NAS supports the NAS API ``repeat`` and ``cell`` * Refactor of RetiariiExperiment to share the common 
implementation with HPO experiment @@ -55,7 +55,7 @@ Documentation ^^^^^^^^^^^^^ * A new translation framework for document -* Add a new quantization demo (`doc <>`__) +* Add a new quantization demo (`doc `__) Notable Bugfixes ^^^^^^^^^^^^^^^^ From 09ba2c52a608bc979aee8ab98edb864b65ef06e1 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 22 Jun 2022 08:30:06 +0800 Subject: [PATCH 44/77] update --- README.md | 4 ++-- docs/source/release.rst | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 53fb2d5ccf..b384ef1299 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ NNI automates feature engineering, neural architecture search, hyperparameter tu ## What's NEW!   -* **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on Apr-18-2022_ -* **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on Apr-18-2022_ +* **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on June-22-2022_ +* **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on June-22-2022_ * **New webinar**: [Introducing Retiarii: A deep learning exploratory-training framework on NNI](https://note.microsoft.com/MSR-Webinar-Retiarii-Registration-Live.html) - _scheduled on June-24-2021_ * **Newly upgraded documentation**: [Doc upgraded](https://nni.readthedocs.io/en/stable) diff --git a/docs/source/release.rst b/docs/source/release.rst index 9eba337d5f..c27575257c 100644 --- a/docs/source/release.rst +++ b/docs/source/release.rst @@ -5,7 +5,7 @@ Change Log ========== -Release 2.8 - 6/21/2022 +Release 2.8 - 6/22/2022 ----------------------- Neural Architecture Search @@ -23,9 +23,9 @@ Model Compression ^^^^^^^^^^^^^^^^^ * *Preview* Refactor and improvement of automatic model compress with a new ``CompressionExperiment`` -* Support customizating module replacement function for unsupported modules in model speedup +* Support customizating module replacement function for unsupported modules in model speedup (`doc `__) * Support the module replacement function for some user mentioned modules -* Support output_padding for contransposed2d in model speedup, thanks external contributor @haoshuai-orka +* Support output_padding for convtranspose2d in model speedup, thanks external contributor @haoshuai-orka Hyper-Parameter Optimization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 72f3ff8500a5c0e6a2c4654c246f2f0827bdbd83 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 5 Jul 2022 22:36:57 +0800 Subject: [PATCH 45/77] nas experiment view --- nni/experiment/config/experiment_config.py | 1 + nni/experiment/config/utils/internal.py | 19 +++++- nni/experiment/launcher.py | 3 +- .../experiment/config/experiment_config.py | 59 +++++++++++-------- nni/tools/nnictl/launcher.py | 2 +- 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index a9bae4c0da..e5124832dd 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -61,6 +61,7 @@ class ExperimentConfig(ConfigBase): # In latter case hybrid training services can have different settings. 
experiment_name: Optional[str] = None + experiment_type: str = 'hpo' search_space_file: Optional[utils.PathLike] = None search_space: Any = None trial_command: Optional[str] = None # training service field diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index d6e55ece01..1862fffc83 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,7 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'get_ipv4_address' + 'get_ipv4_address', 'init_experiment_config' ] import copy @@ -197,3 +197,20 @@ def get_ipv4_address() -> str: addr = s.getsockname()[0] s.close() return addr + +def init_experiment_config(config_json) -> ConfigBase: + from ..experiment_config import ExperimentConfig + from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig + if 'experimentType' in config_json: + if config_json['experimentType'] == 'hpo': + return ExperimentConfig(**config_json) + elif config_json['experimentType'] == 'nas': + return RetiariiExeConfig(**config_json) + else: + raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + else: + # for backward compatibility, experiment config <= v2.8 does not have "experiment_type" + if 'executionEngine' in config_json: + return RetiariiExeConfig(**config_json) + else: + return ExperimentConfig(**config_json) diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index 8ad250a59b..9c50bd92e3 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -18,6 +18,7 @@ from typing_extensions import Literal from .config import ExperimentConfig +from .config.utils import init_experiment_config from . import rest from ..tools.nnictl.config_utils import Experiments, Config from ..tools.nnictl.nnictl_utils import update_experiment @@ -203,7 +204,7 @@ def _save_experiment_information(experiment_id: str, port: int, start_time: int, def get_stopped_experiment_config(exp_id, exp_dir=None): config_json = get_stopped_experiment_config_json(exp_id, exp_dir) # type: ignore - config = ExperimentConfig(**config_json) # type: ignore + config = init_experiment_config(config_json) # type: ignore if exp_dir and not os.path.samefile(exp_dir, config.experiment_working_directory): msg = 'Experiment working directory provided in command line (%s) is different from experiment config (%s)' _logger.warning(msg, exp_dir, config.experiment_working_directory) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index 4eb5ac82a0..e79347cd29 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -28,6 +28,7 @@ def _get_ee_config_class(engine_name): @dataclass(init=False) class RetiariiExeConfig(ExperimentConfig): # FIXME: refactor this class to inherit from a new common base class with HPO config + experiment_type: str = 'nas' search_space: Any = '' trial_code_directory: utils.PathLike = '.' trial_command: str = '_reserved' @@ -43,33 +44,39 @@ def __init__(self, training_service_platform: Union[str, None] = None, **kwargs): super().__init__(training_service_platform, **kwargs) self.execution_engine = execution_engine + + self._is_complete_config = False + if self.search_space != '' and self.trial_code_directory != '.' 
and self.trial_command != '_reserved': + # only experiment view and resume have complete config in init, as the config is directly loaded + self._is_complete_config = True def _canonicalize(self, _parents): - msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' - if self.search_space != '': - raise ValueError(msg.format('search_space', self.search_space)) - # TODO: maybe we should also allow users to specify trial_code_directory - if str(self.trial_code_directory) != '.' and not os.path.isabs(self.trial_code_directory): - raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) - - trial_command_tmpl = '{envs} {python} -m nni.retiarii.trial_entry {execution_engine}' - if self.trial_command != '_reserved' and '-m nni.retiarii.trial_entry' not in self.trial_command: - raise ValueError(msg.format('trial_command', self.trial_command)) - - if isinstance(self.execution_engine, str): - self.execution_engine = execution_engine_config_factory(self.execution_engine) - - _trial_command_params = { - # Default variables - 'envs': '', - # TODO: maybe use sys.executable rendered in trial side (e.g., trial_runner) - 'python': sys.executable, - 'execution_engine': self.execution_engine.name, - - # This should override the parameters above. - **(self._trial_command_params or {}) - } - - self.trial_command = trial_command_tmpl.format(**_trial_command_params).strip() + if not self._is_complete_config: + msg = '{} is not supposed to be set in Retiarii experiment by users, your config is {}.' + if self.search_space != '': + raise ValueError(msg.format('search_space', self.search_space)) + # TODO: maybe we should also allow users to specify trial_code_directory + if str(self.trial_code_directory) != '.' and not os.path.isabs(self.trial_code_directory): + raise ValueError(msg.format('trial_code_directory', self.trial_code_directory)) + + trial_command_tmpl = '{envs} {python} -m nni.retiarii.trial_entry {execution_engine}' + if self.trial_command != '_reserved' and '-m nni.retiarii.trial_entry' not in self.trial_command: + raise ValueError(msg.format('trial_command', self.trial_command)) + + if isinstance(self.execution_engine, str): + self.execution_engine = execution_engine_config_factory(self.execution_engine) + + _trial_command_params = { + # Default variables + 'envs': '', + # TODO: maybe use sys.executable rendered in trial side (e.g., trial_runner) + 'python': sys.executable, + 'execution_engine': self.execution_engine.name, + + # This should override the parameters above. 
+ **(self._trial_command_params or {}) + } + + self.trial_command = trial_command_tmpl.format(**_trial_command_params).strip() super()._canonicalize([self]) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index af10717654..64adc7fe1c 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -119,4 +119,4 @@ def view_experiment(args): exit() exp = Experiment._view(exp_id, exp_dir) - exp.start(port, run_mode=RunMode.Detach) + exp.start(port, run_mode=RunMode.Detach) \ No newline at end of file From e0342eb708d761c2a1e963ec3eea5f09336ae9c1 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 5 Jul 2022 22:45:28 +0800 Subject: [PATCH 46/77] minor --- nni/tools/nnictl/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 64adc7fe1c..af10717654 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -119,4 +119,4 @@ def view_experiment(args): exit() exp = Experiment._view(exp_id, exp_dir) - exp.start(port, run_mode=RunMode.Detach) \ No newline at end of file + exp.start(port, run_mode=RunMode.Detach) From 99374f3756139126d4a32e54b3210d3374918cc4 Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 7 Jul 2022 20:01:49 +0800 Subject: [PATCH 47/77] support nas experiment resume --- nni/retiarii/execution/utils.py | 37 +++++++++++++- .../experiment/config/experiment_config.py | 35 ++++++++++--- nni/retiarii/experiment/pytorch.py | 51 +++++++++---------- nni/retiarii/integration.py | 4 ++ 4 files changed, 90 insertions(+), 37 deletions(-) diff --git a/nni/retiarii/execution/utils.py b/nni/retiarii/execution/utils.py index 7615fb9988..5404cab2fc 100644 --- a/nni/retiarii/execution/utils.py +++ b/nni/retiarii/execution/utils.py @@ -1,8 +1,15 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from typing import Any, List +from typing import Any, List, cast + +from nni.experiment.config.training_services import RemoteConfig +from .interface import AbstractExecutionEngine from ..graph import Model +from ..experiment.config import ( + BaseEngineConfig, PyEngineConfig, + CgoEngineConfig, BenchmarkEngineConfig +) def _unpack_if_only_one(ele: List[Any]): if len(ele) == 1: @@ -26,3 +33,31 @@ def mutation_dict_to_summary(mutation: dict) -> dict: def get_mutation_summary(model: Model) -> dict: mutation = get_mutation_dict(model) return mutation_dict_to_summary(mutation) + +def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: + if isinstance(config.execution_engine, BaseEngineConfig): + from .base import BaseExecutionEngine + return BaseExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, CgoEngineConfig): + from .cgo_engine import CGOExecutionEngine + + assert not isinstance(config.training_service, list) \ + and config.training_service.platform == 'remote', \ + "CGO execution engine currently only supports remote training service" + assert config.execution_engine.batch_waiting_time is not None \ + and config.execution_engine.max_concurrency_cgo is not None + return CGOExecutionEngine(cast(RemoteConfig, config.training_service), + max_concurrency=config.execution_engine.max_concurrency_cgo, + batch_waiting_time=config.execution_engine.batch_waiting_time, + rest_port=port, + rest_url_prefix=url_prefix) + elif isinstance(config.execution_engine, PyEngineConfig): + from .python import PurePythonExecutionEngine + return PurePythonExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, BenchmarkEngineConfig): + from .benchmark import BenchmarkExecutionEngine + assert config.execution_engine.benchmark is not None, \ + '"benchmark" must be set when benchmark execution engine is used.' 
+ return BenchmarkExecutionEngine(config.execution_engine.benchmark) + else: + raise ValueError(f'Unsupported engine type: {config.execution_engine}') \ No newline at end of file diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index e79347cd29..b6f0149d09 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -3,8 +3,8 @@ import os import sys -from dataclasses import dataclass -from typing import Any, Dict, Union, Optional +from dataclasses import dataclass, MISSING +from typing import Any, Dict, Union, Optional, overload from nni.experiment.config import utils, ExperimentConfig @@ -12,12 +12,20 @@ __all__ = ['RetiariiExeConfig'] -def execution_engine_config_factory(engine_name): - # FIXME: may move this function to experiment utils in future +# TODO: may move this function to experiment utils in future +def init_execution_engine_config(engine_config: Union[str, dict]) -> ExecutionEngineConfig: + if isinstance(engine_config, str): + engine_name = engine_config + else: + engine_name = engine_config['name'] cls = _get_ee_config_class(engine_name) if cls is None: raise ValueError(f'Invalid execution engine name: {engine_name}') - return cls() + engine = cls() + if isinstance(engine_config, dict): + for key, value in engine_config.items(): + setattr(engine, key, value) + return engine def _get_ee_config_class(engine_name): for cls in ExecutionEngineConfig.__subclasses__(): @@ -43,8 +51,17 @@ def __init__(self, training_service_platform: Union[str, None] = None, execution_engine: Union[str, ExecutionEngineConfig] = 'py', **kwargs): super().__init__(training_service_platform, **kwargs) - self.execution_engine = execution_engine - + + if self.execution_engine != MISSING: + # this branch means kwargs is not {} and self.execution_engine has been assigned in super(), + # reassign it because super() may instantiate ExecutionEngineConfig by mistake + self.execution_engine = init_execution_engine_config(kwargs['executionEngine']) + del kwargs['executionEngine'] + elif isinstance(execution_engine, str): + self.execution_engine = init_execution_engine_config(execution_engine) + else: + self.execution_engine = execution_engine + self._is_complete_config = False if self.search_space != '' and self.trial_code_directory != '.' 
and self.trial_command != '_reserved': # only experiment view and resume have complete config in init, as the config is directly loaded @@ -63,8 +80,10 @@ def _canonicalize(self, _parents): if self.trial_command != '_reserved' and '-m nni.retiarii.trial_entry' not in self.trial_command: raise ValueError(msg.format('trial_command', self.trial_command)) + # this canonicalize is necessary because users may assign new execution engine str + # after execution engine config is instantiated if isinstance(self.execution_engine, str): - self.execution_engine = execution_engine_config_factory(self.execution_engine) + self.execution_engine = init_execution_engine_config(self.execution_engine) _trial_command_params = { # Default variables diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 8dd6118842..31d7d550b3 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -14,6 +14,7 @@ import torch import torch.nn as nn from nni.experiment import Experiment, RunMode +from nni.experiment import launcher from nni.experiment.config.training_services import RemoteConfig from .config import ( @@ -24,7 +25,7 @@ from ..converter import convert_to_graph from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine -from ..execution.utils import get_mutation_dict +from ..execution.utils import get_mutation_dict, init_execution_engine from ..graph import Evaluator from ..integration import RetiariiAdvisor from ..mutator import Mutator @@ -220,33 +221,7 @@ def _run_strategy(self, config: RetiariiExeConfig): # TODO: find out a proper way to show no more trial message on WebUI def _create_execution_engine(self, config: RetiariiExeConfig) -> None: - #TODO: we will probably need a execution engine factory to make this clean and elegant - if isinstance(config.execution_engine, BaseEngineConfig): - from ..execution.base import BaseExecutionEngine - engine = BaseExecutionEngine(self.port, self.url_prefix) - elif isinstance(config.execution_engine, CgoEngineConfig): - from ..execution.cgo_engine import CGOExecutionEngine - - assert not isinstance(config.training_service, list) \ - and config.training_service.platform == 'remote', \ - "CGO execution engine currently only supports remote training service" - assert config.execution_engine.batch_waiting_time is not None \ - and config.execution_engine.max_concurrency_cgo is not None - engine = CGOExecutionEngine(cast(RemoteConfig, config.training_service), - max_concurrency=config.execution_engine.max_concurrency_cgo, - batch_waiting_time=config.execution_engine.batch_waiting_time, - rest_port=self.port, - rest_url_prefix=self.url_prefix) - elif isinstance(config.execution_engine, PyEngineConfig): - from ..execution.python import PurePythonExecutionEngine - engine = PurePythonExecutionEngine(self.port, self.url_prefix) - elif isinstance(config.execution_engine, BenchmarkEngineConfig): - from ..execution.benchmark import BenchmarkExecutionEngine - assert config.execution_engine.benchmark is not None, \ - '"benchmark" must be set when benchmark execution engine is used.' 
- engine = BenchmarkExecutionEngine(config.execution_engine.benchmark) - else: - raise ValueError(f'Unsupported engine type: {config.execution_engine}') + engine = init_execution_engine(config, self.port, self.url_prefix) set_execution_engine(engine) def start(self, *args, **kwargs) -> None: @@ -360,3 +335,23 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for return [model_to_pytorch_script(model) for model in all_models[:top_k]] elif formatter == 'dict': return [get_mutation_dict(model) for model in all_models[:top_k]] + + def resume(self, experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): + """ + Resume a stopped experiment. + + Parameters + ---------- + experiment_id + The stopped experiment id. + port + The port of web UI. + wait_completion + If true, run in the foreground. If false, run in the background. + debug + Whether to start in debug mode. + """ + self.id = experiment_id + self._action = 'resume' + config = launcher.get_stopped_experiment_config(experiment_id, None) + self.run(config, port=port, debug=debug) \ No newline at end of file diff --git a/nni/retiarii/integration.py b/nni/retiarii/integration.py index 26f479727f..f56f66308d 100644 --- a/nni/retiarii/integration.py +++ b/nni/retiarii/integration.py @@ -184,3 +184,7 @@ def _process_value(value) -> Any: # hopefully a float else: return value return value + + def handle_import_data(self, data): + # FIXME: ignore imported data for now, as strategy has not supported resume + pass From cbaad0f213a024e3c7374a780d024f364c2ca0ed Mon Sep 17 00:00:00 2001 From: quzha Date: Thu, 7 Jul 2022 20:12:34 +0800 Subject: [PATCH 48/77] fix pylint --- nni/retiarii/experiment/config/experiment_config.py | 2 +- nni/retiarii/experiment/pytorch.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nni/retiarii/experiment/config/experiment_config.py b/nni/retiarii/experiment/config/experiment_config.py index b6f0149d09..dc07518513 100644 --- a/nni/retiarii/experiment/config/experiment_config.py +++ b/nni/retiarii/experiment/config/experiment_config.py @@ -4,7 +4,7 @@ import os import sys from dataclasses import dataclass, MISSING -from typing import Any, Dict, Union, Optional, overload +from typing import Any, Dict, Union, Optional from nni.experiment.config import utils, ExperimentConfig diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 31d7d550b3..7c22554d8f 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -15,7 +15,6 @@ import torch.nn as nn from nni.experiment import Experiment, RunMode from nni.experiment import launcher -from nni.experiment.config.training_services import RemoteConfig from .config import ( RetiariiExeConfig, OneshotEngineConfig, BaseEngineConfig, From 0abe5a3c8452b721f41b443f2df31014cadaa947 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 11 Jul 2022 10:31:44 +0800 Subject: [PATCH 49/77] finish main functionality --- nni/experiment/config/utils/internal.py | 18 +++- nni/retiarii/experiment/pytorch.py | 116 ++++++++++++++++++++---- nni/retiarii/graph.py | 10 +- nni/tools/nnictl/launcher.py | 19 +++- 4 files changed, 138 insertions(+), 25 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 1862fffc83..4e6e11934c 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,7 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 
'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'get_ipv4_address', 'init_experiment_config' + 'get_ipv4_address', 'init_experiment_config', 'get_experiment_class_using_config' ] import copy @@ -214,3 +214,19 @@ def init_experiment_config(config_json) -> ConfigBase: return RetiariiExeConfig(**config_json) else: return ExperimentConfig(**config_json) + +def get_experiment_class_using_config(config_json): + from ...experiment import Experiment + from nni.retiarii.experiment.pytorch import RetiariiExperiment + if 'experimentType' in config_json: + if config_json['experimentType'] == 'hpo': + return Experiment + elif config_json['experimentType'] == 'nas': + return RetiariiExperiment + else: + raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + else: + if 'executionEngine' in config_json: + return RetiariiExperiment + else: + return Experiment \ No newline at end of file diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 7c22554d8f..93ed0c649c 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -5,6 +5,8 @@ import logging +import os +import time import warnings from threading import Thread from typing import Any, List, Union, cast @@ -13,6 +15,7 @@ import torch import torch.nn as nn +from nni.common import dump, load from nni.experiment import Experiment, RunMode from nni.experiment import launcher @@ -25,7 +28,7 @@ from ..converter.graph_gen import GraphConverterWithShape from ..execution import list_models, set_execution_engine from ..execution.utils import get_mutation_dict, init_execution_engine -from ..graph import Evaluator +from ..graph import Evaluator, Model from ..integration import RetiariiAdvisor from ..mutator import Mutator from ..nn.pytorch.mutator import ( @@ -35,6 +38,7 @@ from ..serializer import is_model_wrapped from ..strategy import BaseStrategy from ..strategy.utils import dry_run_for_formatted_search_space +from nni.retiarii import strategy _logger = logging.getLogger(__name__) @@ -186,7 +190,8 @@ def __init__(self, base_model: nn.Module, 'Please consider specifying it as a positional argument, or use `evaluator`.', DeprecationWarning) evaluator = trainer - if evaluator is None: + # base_model is None means the experiment is in resume or view mode + if base_model is not None and evaluator is None: raise ValueError('Evaluator should not be none.') self.base_model = base_model @@ -204,18 +209,11 @@ def __init__(self, base_model: nn.Module, 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, RuntimeWarning) - def _run_strategy(self, config: RetiariiExeConfig): - base_model_ir, self.applied_mutators = preprocess_model( - self.base_model, self.evaluator, self.applied_mutators, - full_ir=not isinstance(config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=config.execution_engine.dummy_input - if isinstance(config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None - ) - + def _run_strategy(self, base_model_ir: Model, applied_mutators: List[Mutator]) -> None: _logger.info('Start strategy...') - search_space = dry_run_for_formatted_search_space(base_model_ir, self.applied_mutators) + search_space = dry_run_for_formatted_search_space(base_model_ir, applied_mutators) self.update_search_space(search_space) - self.strategy.run(base_model_ir, self.applied_mutators) + self.strategy.run(base_model_ir, applied_mutators) _logger.info('Strategy exit') # TODO: find out a proper way to show no more trial message on WebUI @@ -223,6 +221,29 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: engine = init_execution_engine(config, self.port, self.url_prefix) set_execution_engine(engine) + def _save_experiment_checkpoint(self, + base_model_ir, + applied_mutators, + strategy) -> None: + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: + dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) + with open(os.path.join(ckp_path, 'applied_mutators'), 'w') as fp: + dump(applied_mutators, fp) + with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: + dump(strategy, fp) + + def _load_experiment_checkpoint(self): + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: + base_model_ir = load(fp=fp) + base_model_ir = Model._load(base_model_ir) + with open(os.path.join(ckp_path, 'applied_mutators'), 'r') as fp: + applied_mutators = load(fp=fp) + with open(os.path.join(ckp_path, 'strategy'), 'r') as fp: + strategy = load(fp=fp) + return base_model_ir, applied_mutators, strategy + def start(self, *args, **kwargs) -> None: """ By design, the only different between `start` and `run` is that `start` is asynchronous, @@ -271,7 +292,20 @@ def run(self, # FIXME: engine cannot be created twice self._create_execution_engine(canonicalized_config) try: - self._run_strategy(canonicalized_config) + if self._action == 'create': + base_model_ir, self.applied_mutators = preprocess_model( + self.base_model, self.evaluator, self.applied_mutators, + full_ir=not isinstance(canonicalized_config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=canonicalized_config.execution_engine.dummy_input + if isinstance(canonicalized_config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None + ) + self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy) + elif self._action == 'resume': + base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint() + else: + raise RuntimeError(f'The experiment mode "{self._action}" is not supposed to invoke run() method.') + + self._run_strategy(base_model_ir, self.applied_mutators) # FIXME: move this logic to strategy with a new API provided by execution engine self._wait_completion() except KeyboardInterrupt: @@ -335,7 +369,36 @@ def 
export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for elif formatter == 'dict': return [get_mutation_dict(model) for model in all_models[:top_k]] - def resume(self, experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): + @staticmethod + def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): + """ + View a stopped experiment. + + Parameters + ---------- + experiment_id + The stopped experiment id. + port + The port of web UI. + non_blocking + If false, run in the foreground. If true, run in the background. + """ + experiment = RetiariiExperiment._view(experiment_id) + # view is nothing specific about RetiariiExperiment, directly using the method in base experiment class + super(RetiariiExperiment, experiment).start(port=port, debug=False, run_mode=RunMode.Detach) + if non_blocking: + return experiment + else: + try: + while True: + time.sleep(10) + except KeyboardInterrupt: + _logger.warning('KeyboardInterrupt detected') + finally: + experiment.stop() + + @staticmethod + def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): """ Resume a stopped experiment. @@ -350,7 +413,24 @@ def resume(self, experiment_id: str, port: int = 8080, wait_completion: bool = T debug Whether to start in debug mode. """ - self.id = experiment_id - self._action = 'resume' - config = launcher.get_stopped_experiment_config(experiment_id, None) - self.run(config, port=port, debug=debug) \ No newline at end of file + experiment = RetiariiExperiment._resume(experiment_id) + experiment.run(experiment.config, port=port, debug=debug) + # always return experiment for user's follow-up operations on the experiment + # wait_completion is not necessary as nas experiment is always in foreground + return experiment + + @staticmethod + def _resume(exp_id, exp_dir=None): + exp = RetiariiExperiment(None) + exp.id = exp_id + exp._action = 'resume' + exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + return exp + + @staticmethod + def _view(exp_id, exp_dir=None): + exp = RetiariiExperiment(None) + exp.id = exp_id + exp._action = 'view' + exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + return exp \ No newline at end of file diff --git a/nni/retiarii/graph.py b/nni/retiarii/graph.py index 0a99f81f18..fc988f45eb 100644 --- a/nni/retiarii/graph.py +++ b/nni/retiarii/graph.py @@ -13,6 +13,7 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast, overload) +from nni.common import dump, load if TYPE_CHECKING: from .mutator import Mutator @@ -172,14 +173,21 @@ def fork(self) -> 'Model': def _load(ir: Any) -> 'Model': model = Model(_internal=True) for graph_name, graph_data in ir.items(): - if graph_name != '_evaluator': + if graph_name not in ['_evaluator', 'model_id', 'python_class', 'python_init_params']: Graph._load(model, graph_name, graph_data)._register() + model.model_id = ir['model_id'] + model.python_class = ir['python_class'] + model.python_init_params = ir['python_init_params'] if '_evaluator' in ir: model.evaluator = Evaluator._load(ir['_evaluator']) return model def _dump(self) -> Any: ret = {name: graph._dump() for name, graph in self.graphs.items()} + # NOTE: only dump some necessary member variable, will be refactored + ret['model_id'] = self.model_id + ret['python_class'] = self.python_class + ret['python_init_params'] = self.python_init_params if self.evaluator is not None: ret['_evaluator'] = 
self.evaluator._dump() return ret diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index af10717654..20fe346ebc 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -11,6 +11,7 @@ from nni.experiment import Experiment, RunMode from nni.experiment.config import ExperimentConfig, convert, utils +from nni.retiarii.experiment.pytorch import RetiariiExperiment from nni.tools.annotation import expand_annotations, generate_search_space # used for v1-only legacy setup, remove them later @@ -104,9 +105,13 @@ def resume_experiment(args): legacy_launcher.resume_experiment(args) exit() - exp = Experiment._resume(exp_id, exp_dir) - run_mode = RunMode.Foreground if foreground else RunMode.Detach - exp.start(port, debug, run_mode) + exp_class = utils.get_experiment_class_using_config(config_json) + if exp_class is RetiariiExperiment: + RetiariiExperiment.resume(exp_id, port, True, debug) + else: + exp = Experiment._resume(exp_id, exp_dir) + run_mode = RunMode.Foreground if foreground else RunMode.Detach + exp.start(port, debug, run_mode) def view_experiment(args): exp_id = args.id @@ -118,5 +123,9 @@ def view_experiment(args): legacy_launcher.view_experiment(args) exit() - exp = Experiment._view(exp_id, exp_dir) - exp.start(port, run_mode=RunMode.Detach) + exp_class = utils.get_experiment_class_using_config(config_json) + if exp_class is RetiariiExperiment: + RetiariiExperiment.view(exp_id, port, non_blocking=True) + else: + exp = Experiment._view(exp_id, exp_dir) + exp.start(port, run_mode=RunMode.Detach) From 811ade74cc01fbe1db1ee221fbc0220206215e67 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 11 Jul 2022 10:54:07 +0800 Subject: [PATCH 50/77] fix pylint --- nni/retiarii/experiment/pytorch.py | 11 +++++------ nni/retiarii/graph.py | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index 93ed0c649c..dbf5eb62b3 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -9,7 +9,7 @@ import time import warnings from threading import Thread -from typing import Any, List, Union, cast +from typing import Any, List, Union, cast, Tuple import colorama @@ -38,7 +38,6 @@ from ..serializer import is_model_wrapped from ..strategy import BaseStrategy from ..strategy.utils import dry_run_for_formatted_search_space -from nni.retiarii import strategy _logger = logging.getLogger(__name__) @@ -222,9 +221,9 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: set_execution_engine(engine) def _save_experiment_checkpoint(self, - base_model_ir, - applied_mutators, - strategy) -> None: + base_model_ir: Model, + applied_mutators: List[Mutator], + strategy: BaseStrategy) -> None: ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) @@ -233,7 +232,7 @@ def _save_experiment_checkpoint(self, with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: dump(strategy, fp) - def _load_experiment_checkpoint(self): + def _load_experiment_checkpoint(self) -> Tuple[Model, List[Mutator], BaseStrategy]: ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) diff --git a/nni/retiarii/graph.py 
b/nni/retiarii/graph.py index fc988f45eb..df542c99f5 100644 --- a/nni/retiarii/graph.py +++ b/nni/retiarii/graph.py @@ -13,7 +13,6 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast, overload) -from nni.common import dump, load if TYPE_CHECKING: from .mutator import Mutator From 7b2d042e648473314ba52f81dc07966107dcb661 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 11 Jul 2022 15:39:26 +0800 Subject: [PATCH 51/77] fix pyright --- nni/experiment/config/utils/internal.py | 6 +++--- nni/retiarii/experiment/pytorch.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 4e6e11934c..a17befe947 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -34,7 +34,9 @@ from .public import is_missing if typing.TYPE_CHECKING: + from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig from ..base import ConfigBase + from ..experiment_config import ExperimentConfig from ..training_service import TrainingServiceConfig ## handle relative path ## @@ -198,9 +200,7 @@ def get_ipv4_address() -> str: s.close() return addr -def init_experiment_config(config_json) -> ConfigBase: - from ..experiment_config import ExperimentConfig - from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig +def init_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return ExperimentConfig(**config_json) diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py index dbf5eb62b3..f9586cd1d5 100644 --- a/nni/retiarii/experiment/pytorch.py +++ b/nni/retiarii/experiment/pytorch.py @@ -224,7 +224,8 @@ def _save_experiment_checkpoint(self, base_model_ir: Model, applied_mutators: List[Mutator], strategy: BaseStrategy) -> None: - ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), + self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) with open(os.path.join(ckp_path, 'applied_mutators'), 'w') as fp: @@ -233,7 +234,8 @@ def _save_experiment_checkpoint(self, dump(strategy, fp) def _load_experiment_checkpoint(self) -> Tuple[Model, List[Mutator], BaseStrategy]: - ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), self.id, 'checkpoint') + ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), + self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) base_model_ir = Model._load(base_model_ir) From 36103d821d0cf33be11a3790c2c37f8d4d0205a5 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 6 Aug 2022 16:09:38 +0800 Subject: [PATCH 52/77] update --- nni/nas/experiment/pytorch.py | 9 +++++---- nni/tools/nnictl/launcher.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index 77a86365c9..c5f62afb41 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -18,7 +18,6 @@ import torch.nn as nn from nni.common import dump, load from nni.experiment import Experiment, 
RunMode, launcher -from nni.experiment.config.training_services import RemoteConfig from nni.nas.execution import list_models, set_execution_engine from nni.nas.execution.common import RetiariiAdvisor, get_mutation_dict, init_execution_engine, Model @@ -173,10 +172,10 @@ class RetiariiExperiment(Experiment): """ def __init__(self, base_model: nn.Module, - evaluator: Union[BaseOneShotTrainer, Evaluator] = cast(Evaluator, None), + evaluator: Evaluator = cast(Evaluator, None), applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), - trainer: BaseOneShotTrainer = cast(BaseOneShotTrainer, None)): + trainer: Any = None): super().__init__(None) self.config: RetiariiExeConfig = cast(RetiariiExeConfig, None) @@ -190,7 +189,7 @@ def __init__(self, base_model: nn.Module, raise ValueError('Evaluator should not be none.') self.base_model = base_model - self.evaluator: Union[Evaluator, BaseOneShotTrainer] = evaluator + self.evaluator: Evaluator = evaluator self.applied_mutators = applied_mutators self.strategy = strategy @@ -257,6 +256,7 @@ def run(self, Run the experiment. This function will block until experiment finish or error. """ + from nni.retiarii.oneshot.interface import BaseOneShotTrainer if isinstance(self.evaluator, BaseOneShotTrainer): # TODO: will throw a deprecation warning soon # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' @@ -349,6 +349,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for config = self.config.canonical_copy() assert not isinstance(config.execution_engine, PyEngineConfig), \ 'You should use `dict` formatter when using Python execution engine.' + from nni.retiarii.oneshot.interface import BaseOneShotTrainer if isinstance(self.evaluator, BaseOneShotTrainer): assert top_k == 1, 'Only support top_k is 1 for now.' return self.evaluator.export() diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 20fe346ebc..b718434cd9 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -11,7 +11,7 @@ from nni.experiment import Experiment, RunMode from nni.experiment.config import ExperimentConfig, convert, utils -from nni.retiarii.experiment.pytorch import RetiariiExperiment +from nni.nas.experiment.pytorch import RetiariiExperiment from nni.tools.annotation import expand_annotations, generate_search_space # used for v1-only legacy setup, remove them later From 12334d6e53cfc0a8be9e25fb84a25b2d7c7f0720 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 6 Aug 2022 17:08:59 +0800 Subject: [PATCH 53/77] resolve comments --- nni/experiment/config/experiment_config.py | 3 ++- nni/experiment/config/utils/internal.py | 11 +++++++---- nni/experiment/launcher.py | 4 ++-- nni/nas/execution/common/utils.py | 10 +++++----- nni/nas/experiment/config/experiment_config.py | 3 ++- nni/nas/experiment/pytorch.py | 2 +- 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/nni/experiment/config/experiment_config.py b/nni/experiment/config/experiment_config.py index e5124832dd..1d301c1515 100644 --- a/nni/experiment/config/experiment_config.py +++ b/nni/experiment/config/experiment_config.py @@ -12,6 +12,7 @@ import logging from pathlib import Path from typing import Any, List, Optional, Union +from typing_extensions import Literal import yaml @@ -61,7 +62,7 @@ class ExperimentConfig(ConfigBase): # In latter case hybrid training services can have different settings. 
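
Alongside the import cleanups above, the constructor's `trainer` parameter survives only as a deprecated alias for `evaluator` (its annotation is relaxed to `Any`, and the old one-shot trainer import is deferred into the methods that still need it). Below is a small, self-contained sketch of that keep-the-old-name-alive pattern; the class name and the exact wording are invented for illustration, only the redirect-with-`DeprecationWarning` idea comes from the patch.

import warnings

class ExperimentSketch:
    """Toy class showing how a renamed argument can be kept alive for a release."""

    def __init__(self, evaluator=None, trainer=None):
        if trainer is not None:
            warnings.warn('`trainer` is deprecated, please use `evaluator` instead.',
                          DeprecationWarning)
            evaluator = trainer  # forward the legacy argument to its new name
        if evaluator is None:
            raise ValueError('Evaluator should not be none.')
        self.evaluator = evaluator

# Old call sites keep working but emit a warning; new ones pass `evaluator`.
obj = ExperimentSketch(trainer=object())
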
experiment_name: Optional[str] = None - experiment_type: str = 'hpo' + experiment_type: Literal['hpo'] = 'hpo' search_space_file: Optional[utils.PathLike] = None search_space: Any = None trial_command: Optional[str] = None # training service field diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 4cea82e01e..6edaac4cfd 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,8 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'get_ipv4_address', 'init_experiment_config', 'get_experiment_class_using_config' + 'load_experiment_config', 'get_experiment_class_using_config', + 'get_ipv4_address' ] import copy @@ -34,7 +35,7 @@ from .public import is_missing if typing.TYPE_CHECKING: - from nni.retiarii.experiment.config.experiment_config import RetiariiExeConfig + from nni.nas.experiment.config import RetiariiExeConfig from ..base import ConfigBase from ..experiment_config import ExperimentConfig from ..training_service import TrainingServiceConfig @@ -201,7 +202,9 @@ def get_ipv4_address() -> str: s.close() return addr -def init_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: +def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: + from nni.nas.experiment.config import RetiariiExeConfig + from ..experiment_config import ExperimentConfig if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return ExperimentConfig(**config_json) @@ -218,7 +221,7 @@ def init_experiment_config(config_json) -> typing.Union[ExperimentConfig, Retiar def get_experiment_class_using_config(config_json): from ...experiment import Experiment - from nni.retiarii.experiment.pytorch import RetiariiExperiment + from nni.nas.experiment.pytorch import RetiariiExperiment if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return Experiment diff --git a/nni/experiment/launcher.py b/nni/experiment/launcher.py index 9c50bd92e3..64b99feb46 100644 --- a/nni/experiment/launcher.py +++ b/nni/experiment/launcher.py @@ -18,7 +18,7 @@ from typing_extensions import Literal from .config import ExperimentConfig -from .config.utils import init_experiment_config +from .config.utils import load_experiment_config from . import rest from ..tools.nnictl.config_utils import Experiments, Config from ..tools.nnictl.nnictl_utils import update_experiment @@ -204,7 +204,7 @@ def _save_experiment_information(experiment_id: str, port: int, start_time: int, def get_stopped_experiment_config(exp_id, exp_dir=None): config_json = get_stopped_experiment_config_json(exp_id, exp_dir) # type: ignore - config = init_experiment_config(config_json) # type: ignore + config = load_experiment_config(config_json) # type: ignore if exp_dir and not os.path.samefile(exp_dir, config.experiment_working_directory): msg = 'Experiment working directory provided in command line (%s) is different from experiment config (%s)' _logger.warning(msg, exp_dir, config.experiment_working_directory) diff --git a/nni/nas/execution/common/utils.py b/nni/nas/execution/common/utils.py index 553abe580f..e92f3e61ac 100644 --- a/nni/nas/execution/common/utils.py +++ b/nni/nas/execution/common/utils.py @@ -1,16 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
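
The helpers above decide between an HPO `Experiment` and a `RetiariiExperiment` from the stored config: an explicit `experimentType` field wins, and older configs that predate the field are treated as NAS exactly when they carry an `executionEngine` entry. A standalone sketch of that decision over a plain dict, returning strings instead of the real classes so it needs no NNI imports:

def guess_experiment_kind(config_json: dict) -> str:
    # Newer configs carry an explicit discriminator.
    if 'experimentType' in config_json:
        kind = config_json['experimentType']
        if kind in ('hpo', 'nas'):
            return kind
        raise ValueError(f'Unknown experiment_type: {kind}')
    # Older configs have no discriminator; only NAS configs store an engine.
    return 'nas' if 'executionEngine' in config_json else 'hpo'

assert guess_experiment_kind({'experimentType': 'nas'}) == 'nas'
assert guess_experiment_kind({'executionEngine': {'name': 'base'}}) == 'nas'
assert guess_experiment_kind({'trialCommand': 'python main.py'}) == 'hpo'
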
-__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary'] +__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary', 'init_execution_engine'] from typing import Any, List, cast from nni.experiment.config.training_services import RemoteConfig from .engine import AbstractExecutionEngine from .graph import Model -from ...experiment.config import ( - BaseEngineConfig, PyEngineConfig, - CgoEngineConfig, BenchmarkEngineConfig -) def unpack_if_only_one(ele: List[Any]): @@ -40,6 +36,10 @@ def get_mutation_summary(model: Model) -> dict: return mutation_dict_to_summary(mutation) def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: + from ...experiment.config import ( + BaseEngineConfig, PyEngineConfig, + CgoEngineConfig, BenchmarkEngineConfig + ) if isinstance(config.execution_engine, BaseEngineConfig): from ..pytorch.graph import BaseExecutionEngine return BaseExecutionEngine(port, url_prefix) diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index af1d4c824c..61f24ba759 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -5,6 +5,7 @@ import sys from dataclasses import dataclass, MISSING from typing import Any, Dict, Union, Optional +from typing_extensions import Literal from nni.experiment.config import utils, ExperimentConfig @@ -36,7 +37,7 @@ def _get_ee_config_class(engine_name): @dataclass(init=False) class RetiariiExeConfig(ExperimentConfig): # FIXME: refactor this class to inherit from a new common base class with HPO config - experiment_type: str = 'nas' + experiment_type: Literal['nas'] = 'nas' search_space: Any = '' trial_code_directory: utils.PathLike = '.' 
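
The `experiment_type` fields above are narrowed from plain `str` to `Literal['hpo']` / `Literal['nas']`, so the discriminator is visible to type checkers and ends up in the canonical JSON. A compact sketch of the idea with throwaway dataclasses; it uses `typing.Literal` and therefore assumes Python 3.8+, whereas the patch imports the `typing_extensions` backport:

from dataclasses import dataclass
from typing import Literal

@dataclass
class HpoConfigSketch:
    experiment_type: Literal['hpo'] = 'hpo'
    trial_command: str = ''

@dataclass
class NasConfigSketch:
    experiment_type: Literal['nas'] = 'nas'
    trial_command: str = '_reserved'

configs = [HpoConfigSketch(trial_command='python main.py'), NasConfigSketch()]
# The discriminator is an ordinary field, so it survives dict/JSON round trips.
assert [c.experiment_type for c in configs] == ['hpo', 'nas']
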
trial_command: str = '_reserved' diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index c5f62afb41..b85e3265fc 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -10,7 +10,7 @@ import time import warnings from threading import Thread -from typing import Any, List, cast, Union, Tuple +from typing import Any, List, cast, Tuple import colorama From 49b1dc4668ece4ff0981629bc68500f14c4552b3 Mon Sep 17 00:00:00 2001 From: quzha Date: Sat, 6 Aug 2022 17:33:24 +0800 Subject: [PATCH 54/77] minor --- nni/experiment/config/utils/internal.py | 9 +++++---- nni/nas/execution/common/integration.py | 2 +- nni/tools/nnictl/launcher.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 6edaac4cfd..e7b6c80a47 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -15,7 +15,7 @@ 'fields', 'is_instance', 'validate_type', 'is_path_like', 'guess_config_type', 'guess_list_config_type', 'training_service_config_factory', 'load_training_service_config', - 'load_experiment_config', 'get_experiment_class_using_config', + 'load_experiment_config', 'get_experiment_cls_using_config', 'get_ipv4_address' ] @@ -203,6 +203,7 @@ def get_ipv4_address() -> str: return addr def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: + # avoid circular import from nni.nas.experiment.config import RetiariiExeConfig from ..experiment_config import ExperimentConfig if 'experimentType' in config_json: @@ -219,9 +220,9 @@ def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, Retiar else: return ExperimentConfig(**config_json) -def get_experiment_class_using_config(config_json): - from ...experiment import Experiment +def get_experiment_cls_using_config(config_json): from nni.nas.experiment.pytorch import RetiariiExperiment + from ...experiment import Experiment if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': return Experiment @@ -233,4 +234,4 @@ def get_experiment_class_using_config(config_json): if 'executionEngine' in config_json: return RetiariiExperiment else: - return Experiment \ No newline at end of file + return Experiment diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 4c56c390dd..ce04b79f5a 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -236,4 +236,4 @@ def _process_value(value) -> Any: # hopefully a float def handle_import_data(self, data): # FIXME: ignore imported data for now, as strategy has not supported resume - pass \ No newline at end of file + pass diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index b718434cd9..8e7fc33f62 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -105,7 +105,7 @@ def resume_experiment(args): legacy_launcher.resume_experiment(args) exit() - exp_class = utils.get_experiment_class_using_config(config_json) + exp_class = utils.get_experiment_cls_using_config(config_json) if exp_class is RetiariiExperiment: RetiariiExperiment.resume(exp_id, port, True, debug) else: @@ -123,7 +123,7 @@ def view_experiment(args): legacy_launcher.view_experiment(args) exit() - exp_class = utils.get_experiment_class_using_config(config_json) + exp_class = utils.get_experiment_cls_using_config(config_json) if exp_class is RetiariiExperiment: 
RetiariiExperiment.view(exp_id, port, non_blocking=True) else: From adf87a855e405c7e8bf67e9b2975673d553f67a5 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 13:57:50 +0800 Subject: [PATCH 55/77] add ut --- nni/nas/execution/common/utils.py | 2 +- .../experiment/config/experiment_config.py | 2 +- nni/nas/experiment/pytorch.py | 24 +++++------ test/ut/nas/test_experiment.py | 42 +++++++++++++++++++ 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/nni/nas/execution/common/utils.py b/nni/nas/execution/common/utils.py index e92f3e61ac..96693f3564 100644 --- a/nni/nas/execution/common/utils.py +++ b/nni/nas/execution/common/utils.py @@ -65,4 +65,4 @@ def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: '"benchmark" must be set when benchmark execution engine is used.' return BenchmarkExecutionEngine(config.execution_engine.benchmark) else: - raise ValueError(f'Unsupported engine type: {config.execution_engine}') \ No newline at end of file + raise ValueError(f'Unsupported engine type: {config.execution_engine}') diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index 61f24ba759..83f3d6c1c4 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -53,7 +53,7 @@ def __init__(self, training_service_platform: Union[str, None] = None, **kwargs): super().__init__(training_service_platform, **kwargs) - if self.execution_engine != MISSING: + if not utils.is_missing(self.execution_engine): # this branch means kwargs is not {} and self.execution_engine has been assigned in super(), # reassign it because super() may instantiate ExecutionEngineConfig by mistake self.execution_engine = init_execution_engine_config(kwargs['executionEngine']) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index b85e3265fc..d6c4b64cd7 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -171,7 +171,7 @@ class RetiariiExperiment(Experiment): ... final_model = Net() """ - def __init__(self, base_model: nn.Module, + def __init__(self, base_model: nn.Module = cast(nn.Module, None), evaluator: Evaluator = cast(Evaluator, None), applied_mutators: List[Mutator] = cast(List[Mutator], None), strategy: BaseStrategy = cast(BaseStrategy, None), @@ -185,8 +185,15 @@ def __init__(self, base_model: nn.Module, evaluator = trainer # base_model is None means the experiment is in resume or view mode - if base_model is not None and evaluator is None: - raise ValueError('Evaluator should not be none.') + if base_model is not None: + if evaluator is None: + raise ValueError('Evaluator should not be none.') + # check for sanity + if not is_model_wrapped(base_model): + warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + + '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' + 'but it may cause inconsistent behavior compared to the time when you add it.' + colorama.Style.RESET_ALL, + RuntimeWarning) self.base_model = base_model self.evaluator: Evaluator = evaluator @@ -196,13 +203,6 @@ def __init__(self, base_model: nn.Module, self._dispatcher = None self._dispatcher_thread = None - # check for sanity - if not is_model_wrapped(base_model): - warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + - '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' - 'but it may cause inconsistent behavior compared to the time when you add it.' 
+ colorama.Style.RESET_ALL, - RuntimeWarning) - def _run_strategy(self, base_model_ir: Model, applied_mutators: List[Mutator]) -> None: _logger.info('Start strategy...') search_space = dry_run_for_formatted_search_space(base_model_ir, applied_mutators) @@ -422,7 +422,7 @@ def _resume(exp_id, exp_dir=None): exp = RetiariiExperiment(None) exp.id = exp_id exp._action = 'resume' - exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) return exp @staticmethod @@ -430,5 +430,5 @@ def _view(exp_id, exp_dir=None): exp = RetiariiExperiment(None) exp.id = exp_id exp._action = 'view' - exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir) + exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) return exp \ No newline at end of file diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index 991e3b23f6..28ea8edc00 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -1,4 +1,6 @@ +from asyncio import subprocess import os +import subprocess import sys import nni @@ -112,6 +114,46 @@ def test_multitrial_experiment(pytestconfig): assert isinstance(exp.export_top_models()[0], dict) exp.stop() +def test_multitrial_experiment_resume_view(pytestconfig): + # start a normal nas experiment + base_model = Net() + evaluator = get_mnist_evaluator() + search_strategy = strategy.Random() + exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) + exp_id = exp.id + exp_config = RetiariiExeConfig('local') + exp_config.trial_concurrency = 1 + exp_config.max_trial_number = 1 + exp_config._trial_command_params = nas_experiment_trial_params(pytestconfig.rootpath) + exp.run(exp_config) + ensure_success(exp) + assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + + # resume the above nas experiment. 
only tested the resume logic in the python side, + # as no more trial is executed after resume, the above experiment is already finished + print('python api resume...') + exp = RetiariiExperiment.resume(exp_id) + ensure_success(exp) + # TODO: currently `export_top_models` does not work as strategy's states are not resumed + # assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + # view the above experiment in non blocking mode then stop it + print('python api view...') + exp = RetiariiExperiment.view(exp_id, non_blocking=True) + exp.stop() + + # the following is nnictl resume and view + print('nnictl resume...') + new_env = os.environ.copy() + new_env['PYTHONPATH'] = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)) + proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) + assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode + print('nnictl view...') + proc = subprocess.run(f'nnictl view {exp_id}', shell=True) + assert proc.returncode == 0, 'view nas experiment failed with code %d' % proc.returncode + proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) + assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode def test_oneshot_experiment(): base_model = Net() From 6ecfd3ac5d4acbf6d73f8b2ddd837e9b5076fa38 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 13:59:35 +0800 Subject: [PATCH 56/77] minor --- nni/nas/experiment/config/experiment_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index 83f3d6c1c4..7a39357b1b 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -3,7 +3,7 @@ import os import sys -from dataclasses import dataclass, MISSING +from dataclasses import dataclass from typing import Any, Dict, Union, Optional from typing_extensions import Literal From 8f20b2f4920a5a2a243cec42c1655f8fb602afec Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 14:13:27 +0800 Subject: [PATCH 57/77] fix pylint --- nni/nas/experiment/config/experiment_config.py | 2 +- nni/nas/experiment/pytorch.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nni/nas/experiment/config/experiment_config.py b/nni/nas/experiment/config/experiment_config.py index 7a39357b1b..de55158c45 100644 --- a/nni/nas/experiment/config/experiment_config.py +++ b/nni/nas/experiment/config/experiment_config.py @@ -99,4 +99,4 @@ def _canonicalize(self, _parents): self.trial_command = trial_command_tmpl.format(**_trial_command_params).strip() - super()._canonicalize([self]) \ No newline at end of file + super()._canonicalize([self]) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index d6c4b64cd7..d7baa71891 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -419,7 +419,7 @@ def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, d @staticmethod def _resume(exp_id, exp_dir=None): - exp = RetiariiExperiment(None) + exp = RetiariiExperiment(cast(nn.Module, None)) exp.id = exp_id exp._action = 'resume' exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) @@ -427,8 +427,8 @@ def _resume(exp_id, exp_dir=None): @staticmethod def _view(exp_id, exp_dir=None): - exp = RetiariiExperiment(None) + exp = RetiariiExperiment(cast(nn.Module, None)) exp.id = 
exp_id exp._action = 'view' exp.config = cast(RetiariiExeConfig, launcher.get_stopped_experiment_config(exp_id, exp_dir)) - return exp \ No newline at end of file + return exp From ece0771ef6d1be6c5e6fd58c7458299734ee099a Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 15:02:22 +0800 Subject: [PATCH 58/77] fix ut --- nni/nas/execution/common/graph.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nni/nas/execution/common/graph.py b/nni/nas/execution/common/graph.py index c413a07959..b1d7f64acc 100644 --- a/nni/nas/execution/common/graph.py +++ b/nni/nas/execution/common/graph.py @@ -126,9 +126,10 @@ def _load(ir: Any) -> 'Model': for graph_name, graph_data in ir.items(): if graph_name not in ['_evaluator', 'model_id', 'python_class', 'python_init_params']: Graph._load(model, graph_name, graph_data)._register() - model.model_id = ir['model_id'] - model.python_class = ir['python_class'] - model.python_init_params = ir['python_init_params'] + if 'model_id' in ir: # backward compatibility + model.model_id = ir['model_id'] + model.python_class = ir['python_class'] + model.python_init_params = ir['python_init_params'] if '_evaluator' in ir: model.evaluator = Evaluator._load(ir['_evaluator']) return model From bb7180428ebd4328c08ffd83c197eaffb74f11b6 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 15:41:17 +0800 Subject: [PATCH 59/77] fix ut --- nni/nas/experiment/pytorch.py | 4 +--- test/ut/experiment/test_exp_config.py | 2 ++ test/ut/experiment/test_ts_remote.py | 2 ++ test/ut/nas/test_graph.py | 4 ++++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index d7baa71891..cec8b62945 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -396,7 +396,7 @@ def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): experiment.stop() @staticmethod - def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): + def resume(experiment_id: str, port: int = 8080, debug: bool = False): """ Resume a stopped experiment. @@ -406,8 +406,6 @@ def resume(experiment_id: str, port: int = 8080, wait_completion: bool = True, d The stopped experiment id. port The port of web UI. - wait_completion - If true, run in the foreground. If false, run in the background. debug Whether to start in debug mode. 
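
The graph.py hunk above makes `Model._load` tolerate IR dumps produced before `model_id`, `python_class`, and `python_init_params` existed, by keying the extra assignments on `'model_id' in ir`. A minimal sketch of the same guard, with the `Graph`/`Evaluator` machinery replaced by plain dicts; only the key names are taken from the patch:

def load_model_ir(ir: dict) -> dict:
    """Rebuild a model record from its dumped IR, tolerating older dumps."""
    model = {'graphs': {}}
    for name, data in ir.items():
        if name not in ('_evaluator', 'model_id', 'python_class', 'python_init_params'):
            model['graphs'][name] = data
    if 'model_id' in ir:  # fields present only in newer dumps
        model['model_id'] = ir['model_id']
        model['python_class'] = ir['python_class']
        model['python_init_params'] = ir['python_init_params']
    return model

old_dump = {'_model': {'nodes': []}}                      # pre-upgrade dump
new_dump = {'_model': {'nodes': []}, 'model_id': 7,
            'python_class': 'Net', 'python_init_params': {}}
assert 'model_id' not in load_model_ir(old_dump)
assert load_model_ir(new_dump)['model_id'] == 7
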
""" diff --git a/test/ut/experiment/test_exp_config.py b/test/ut/experiment/test_exp_config.py index 972f8076e5..5048a4322c 100644 --- a/test/ut/experiment/test_exp_config.py +++ b/test/ut/experiment/test_exp_config.py @@ -28,6 +28,7 @@ def expand_path(path): minimal_class.tuner.name = 'random' minimal_canon = { + 'experimentType': 'hpo', 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', 'trialCodeDirectory': os.path.realpath('.'), @@ -54,6 +55,7 @@ def expand_path(path): detailed_canon = { 'experimentName': 'test case', + 'experimentType': 'hpo', 'searchSpaceFile': expand_path('assets/search_space.json'), 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', diff --git a/test/ut/experiment/test_ts_remote.py b/test/ut/experiment/test_ts_remote.py index 770e6faac9..e7c16972e0 100644 --- a/test/ut/experiment/test_ts_remote.py +++ b/test/ut/experiment/test_ts_remote.py @@ -43,6 +43,7 @@ ) minimal_canon = { + 'experimentType': 'hpo', 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', 'trialCodeDirectory': os.path.realpath('.'), @@ -106,6 +107,7 @@ } detailed_canon = { + 'experimentType': 'hpo', 'searchSpace': {'a': 1}, 'trialCommand': 'python main.py', 'trialCodeDirectory': os.path.realpath('.'), diff --git a/test/ut/nas/test_graph.py b/test/ut/nas/test_graph.py index 69dd8c52a9..7a8a39900b 100644 --- a/test/ut/nas/test_graph.py +++ b/test/ut/nas/test_graph.py @@ -37,6 +37,10 @@ def _test_file(json_path): # skip comparison of _evaluator orig_ir.pop('_evaluator') dump_ir.pop('_evaluator') + # skip three experiment fields + dump_ir.pop('model_id') + dump_ir.pop('python_class') + dump_ir.pop('python_init_params') assert orig_ir == dump_ir From 31856093541fe1dd11be811d84edb17e82bebb4c Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 15:49:46 +0800 Subject: [PATCH 60/77] minor --- nni/tools/nnictl/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index 8e7fc33f62..c9f4860672 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -107,7 +107,7 @@ def resume_experiment(args): exp_class = utils.get_experiment_cls_using_config(config_json) if exp_class is RetiariiExperiment: - RetiariiExperiment.resume(exp_id, port, True, debug) + RetiariiExperiment.resume(exp_id, port, debug) else: exp = Experiment._resume(exp_id, exp_dir) run_mode = RunMode.Foreground if foreground else RunMode.Detach From 0b4cc963867f91e79d0300d3b0805b91b43b02d6 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 17:01:09 +0800 Subject: [PATCH 61/77] resolve comments --- nni/experiment/config/utils/internal.py | 22 ++++++-------- nni/nas/execution/api.py | 38 +++++++++++++++++++++++-- nni/nas/execution/common/utils.py | 38 ++----------------------- nni/nas/experiment/pytorch.py | 3 +- 4 files changed, 49 insertions(+), 52 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index e7b6c80a47..5cbe216a74 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -205,20 +205,16 @@ def get_ipv4_address() -> str: def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: # avoid circular import from nni.nas.experiment.config import RetiariiExeConfig + from nni.nas.experiment.pytorch import RetiariiExperiment from ..experiment_config import ExperimentConfig - if 'experimentType' in config_json: - if config_json['experimentType'] == 'hpo': - return 
ExperimentConfig(**config_json) - elif config_json['experimentType'] == 'nas': - return RetiariiExeConfig(**config_json) - else: - raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + from ...experiment import Experiment + exp_cls = get_experiment_cls_using_config(config_json) + if exp_cls is Experiment: + return ExperimentConfig(**config_json) + elif exp_cls is RetiariiExperiment: + return RetiariiExeConfig(**config_json) else: - # for backward compatibility, experiment config <= v2.8 does not have "experiment_type" - if 'executionEngine' in config_json: - return RetiariiExeConfig(**config_json) - else: - return ExperimentConfig(**config_json) + raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') def get_experiment_cls_using_config(config_json): from nni.nas.experiment.pytorch import RetiariiExperiment @@ -229,7 +225,7 @@ def get_experiment_cls_using_config(config_json): elif config_json['experimentType'] == 'nas': return RetiariiExperiment else: - raise KeyError(f'Unknown experiment_type: {config_json["experimentType"]}') + raise ValueError(f'Unknown experiment_type: {config_json["experimentType"]}') else: if 'executionEngine' in config_json: return RetiariiExperiment diff --git a/nni/nas/execution/api.py b/nni/nas/execution/api.py index ef2558e758..4581bbe42f 100644 --- a/nni/nas/execution/api.py +++ b/nni/nas/execution/api.py @@ -3,8 +3,9 @@ import time import warnings -from typing import Iterable +from typing import Iterable, cast +from nni.experiment.config.training_services import RemoteConfig from nni.nas.execution.common import ( Model, ModelStatus, AbstractExecutionEngine, @@ -14,11 +15,44 @@ _execution_engine = None _default_listener = None -__all__ = ['get_execution_engine', 'get_and_register_default_listener', +__all__ = ['init_execution_engine', 'get_execution_engine', 'get_and_register_default_listener', 'list_models', 'submit_models', 'wait_models', 'query_available_resources', 'set_execution_engine', 'is_stopped_exec', 'budget_exhausted'] +def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: + from ..experiment.config import ( + BaseEngineConfig, PyEngineConfig, + CgoEngineConfig, BenchmarkEngineConfig + ) + if isinstance(config.execution_engine, BaseEngineConfig): + from .pytorch.graph import BaseExecutionEngine + return BaseExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, CgoEngineConfig): + from .pytorch.cgo.engine import CGOExecutionEngine + + assert not isinstance(config.training_service, list) \ + and config.training_service.platform == 'remote', \ + "CGO execution engine currently only supports remote training service" + assert config.execution_engine.batch_waiting_time is not None \ + and config.execution_engine.max_concurrency_cgo is not None + return CGOExecutionEngine(cast(RemoteConfig, config.training_service), + max_concurrency=config.execution_engine.max_concurrency_cgo, + batch_waiting_time=config.execution_engine.batch_waiting_time, + rest_port=port, + rest_url_prefix=url_prefix) + elif isinstance(config.execution_engine, PyEngineConfig): + from .pytorch.simplified import PurePythonExecutionEngine + return PurePythonExecutionEngine(port, url_prefix) + elif isinstance(config.execution_engine, BenchmarkEngineConfig): + from .pytorch.benchmark import BenchmarkExecutionEngine + assert config.execution_engine.benchmark is not None, \ + '"benchmark" must be set when benchmark execution engine is used.' 
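
Throughout these changes the NAS-specific imports are pulled out of module scope: `load_experiment_config` imports lazily behind an "avoid circular import" note, type-only names move under `typing.TYPE_CHECKING`, and the engine factory above imports each engine class inside its own branch. A generic sketch of those two deferral techniques, using a standard-library module as a stand-in for the heavy or cycle-prone dependency:

from __future__ import annotations
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers, so it can never create an import cycle
    # and costs nothing at runtime.
    from decimal import Decimal

def to_decimal(value: str) -> Decimal:
    # Runtime import deferred into the function body: the dependency is paid
    # for only when this code path actually runs.
    from decimal import Decimal
    return Decimal(value)

print(to_decimal('1.5'))
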
+ return BenchmarkExecutionEngine(config.execution_engine.benchmark) + else: + raise ValueError(f'Unsupported engine type: {config.execution_engine}') + + def set_execution_engine(engine: AbstractExecutionEngine) -> None: global _execution_engine if _execution_engine is not None: diff --git a/nni/nas/execution/common/utils.py b/nni/nas/execution/common/utils.py index 96693f3564..5230ddf706 100644 --- a/nni/nas/execution/common/utils.py +++ b/nni/nas/execution/common/utils.py @@ -1,11 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary', 'init_execution_engine'] +__all__ = ['unpack_if_only_one', 'get_mutation_dict', 'mutation_dict_to_summary', 'get_mutation_summary'] -from typing import Any, List, cast -from nni.experiment.config.training_services import RemoteConfig -from .engine import AbstractExecutionEngine +from typing import Any, List from .graph import Model @@ -34,35 +32,3 @@ def mutation_dict_to_summary(mutation: dict) -> dict: def get_mutation_summary(model: Model) -> dict: mutation = get_mutation_dict(model) return mutation_dict_to_summary(mutation) - -def init_execution_engine(config, port, url_prefix) -> AbstractExecutionEngine: - from ...experiment.config import ( - BaseEngineConfig, PyEngineConfig, - CgoEngineConfig, BenchmarkEngineConfig - ) - if isinstance(config.execution_engine, BaseEngineConfig): - from ..pytorch.graph import BaseExecutionEngine - return BaseExecutionEngine(port, url_prefix) - elif isinstance(config.execution_engine, CgoEngineConfig): - from ..pytorch.cgo.engine import CGOExecutionEngine - - assert not isinstance(config.training_service, list) \ - and config.training_service.platform == 'remote', \ - "CGO execution engine currently only supports remote training service" - assert config.execution_engine.batch_waiting_time is not None \ - and config.execution_engine.max_concurrency_cgo is not None - return CGOExecutionEngine(cast(RemoteConfig, config.training_service), - max_concurrency=config.execution_engine.max_concurrency_cgo, - batch_waiting_time=config.execution_engine.batch_waiting_time, - rest_port=port, - rest_url_prefix=url_prefix) - elif isinstance(config.execution_engine, PyEngineConfig): - from ..pytorch.simplified import PurePythonExecutionEngine - return PurePythonExecutionEngine(port, url_prefix) - elif isinstance(config.execution_engine, BenchmarkEngineConfig): - from ..pytorch.benchmark import BenchmarkExecutionEngine - assert config.execution_engine.benchmark is not None, \ - '"benchmark" must be set when benchmark execution engine is used.' 
- return BenchmarkExecutionEngine(config.execution_engine.benchmark) - else: - raise ValueError(f'Unsupported engine type: {config.execution_engine}') diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index cec8b62945..7b5bde39ac 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -20,7 +20,8 @@ from nni.experiment import Experiment, RunMode, launcher from nni.nas.execution import list_models, set_execution_engine -from nni.nas.execution.common import RetiariiAdvisor, get_mutation_dict, init_execution_engine, Model +from nni.nas.execution.api import init_execution_engine +from nni.nas.execution.common import RetiariiAdvisor, get_mutation_dict, Model from nni.nas.execution.pytorch.codegen import model_to_pytorch_script from nni.nas.execution.pytorch.converter import convert_to_graph from nni.nas.execution.pytorch.converter.graph_gen import GraphConverterWithShape From e520c0d890b598dcc7b0e6d2e1856e49080da9b5 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 18:07:23 +0800 Subject: [PATCH 62/77] resolve comments --- nni/experiment/config/utils/internal.py | 6 ++++-- nni/nas/experiment/pytorch.py | 26 ++++++++++++++----------- test/ut/nas/test_experiment.py | 1 - 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 5cbe216a74..21c4f4b8b0 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -35,7 +35,9 @@ from .public import is_missing if typing.TYPE_CHECKING: + from nni.nas.experiment.pytorch import RetiariiExperiment from nni.nas.experiment.config import RetiariiExeConfig + from ...experiment import Experiment from ..base import ConfigBase from ..experiment_config import ExperimentConfig from ..training_service import TrainingServiceConfig @@ -202,7 +204,7 @@ def get_ipv4_address() -> str: s.close() return addr -def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, RetiariiExeConfig]: +def load_experiment_config(config_json: dict) -> ExperimentConfig | RetiariiExeConfig: # avoid circular import from nni.nas.experiment.config import RetiariiExeConfig from nni.nas.experiment.pytorch import RetiariiExperiment @@ -216,7 +218,7 @@ def load_experiment_config(config_json) -> typing.Union[ExperimentConfig, Retiar else: raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') -def get_experiment_cls_using_config(config_json): +def get_experiment_cls_using_config(config_json: dict) -> Experiment | RetiariiExperiment: from nni.nas.experiment.pytorch import RetiariiExperiment from ...experiment import Experiment if 'experimentType' in config_json: diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index 7b5bde39ac..c31ee1e51a 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -10,7 +10,7 @@ import time import warnings from threading import Thread -from typing import Any, List, cast, Tuple +from typing import Any, List, cast, Tuple, TYPE_CHECKING import colorama @@ -38,6 +38,9 @@ PyEngineConfig, CgoEngineConfig, BenchmarkEngineConfig ) +if TYPE_CHECKING: + from pathlib import Path + _logger = logging.getLogger(__name__) @@ -219,9 +222,9 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: def _save_experiment_checkpoint(self, base_model_ir: Model, applied_mutators: List[Mutator], - strategy: BaseStrategy) -> None: - ckp_path = 
os.path.join(os.path.expanduser(self.config.experiment_working_directory), - self.id, 'checkpoint') + strategy: BaseStrategy, + exp_work_dir: Path) -> None: + ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) with open(os.path.join(ckp_path, 'applied_mutators'), 'w') as fp: @@ -229,9 +232,8 @@ def _save_experiment_checkpoint(self, with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: dump(strategy, fp) - def _load_experiment_checkpoint(self) -> Tuple[Model, List[Mutator], BaseStrategy]: - ckp_path = os.path.join(os.path.expanduser(self.config.experiment_working_directory), - self.id, 'checkpoint') + def _load_experiment_checkpoint(self, exp_work_dir: Path) -> Tuple[Model, List[Mutator], BaseStrategy]: + ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) base_model_ir = Model._load(base_model_ir) @@ -297,9 +299,11 @@ def run(self, dummy_input=canonicalized_config.execution_engine.dummy_input if isinstance(canonicalized_config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) - self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy) + self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy, + canonicalized_config.experiment_working_directory) elif self._action == 'resume': - base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint() + base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint( + canonicalized_config.experiment_working_directory) else: raise RuntimeError(f'The experiment mode "{self._action}" is not supposed to invoke run() method.') @@ -369,7 +373,7 @@ def export_top_models(self, top_k: int = 1, optimize_mode: str = 'maximize', for return [get_mutation_dict(model) for model in all_models[:top_k]] @staticmethod - def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): + def view(experiment_id: str, port: int = 8080, non_blocking: bool = False) -> RetiariiExperiment | None: """ View a stopped experiment. @@ -397,7 +401,7 @@ def view(experiment_id: str, port: int = 8080, non_blocking: bool = False): experiment.stop() @staticmethod - def resume(experiment_id: str, port: int = 8080, debug: bool = False): + def resume(experiment_id: str, port: int = 8080, debug: bool = False) -> RetiariiExperiment: """ Resume a stopped experiment. 
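
`_save_experiment_checkpoint` / `_load_experiment_checkpoint` above persist the dumped model IR, the applied mutators, and the strategy under `<experiment working dir>/<experiment id>/checkpoint`, which is what `resume()` later reads back. A simplified round-trip sketch that keeps that file layout but serializes plain dicts with the standard-library `json` module instead of `nni.common.dump`/`load`, and creates the directory itself (the real code relies on NNI having created it):

import json
import os
import tempfile

def save_checkpoint(work_dir, exp_id, model_ir, mutators, strategy):
    ckp_path = os.path.join(work_dir, exp_id, 'checkpoint')
    os.makedirs(ckp_path, exist_ok=True)
    for name, obj in (('nas_model', model_ir),
                      ('applied_mutators', mutators),
                      ('strategy', strategy)):
        with open(os.path.join(ckp_path, name), 'w') as fp:
            json.dump(obj, fp)
    return ckp_path

def load_checkpoint(work_dir, exp_id):
    ckp_path = os.path.join(work_dir, exp_id, 'checkpoint')
    loaded = []
    for name in ('nas_model', 'applied_mutators', 'strategy'):
        with open(os.path.join(ckp_path, name)) as fp:
            loaded.append(json.load(fp))
    return tuple(loaded)

with tempfile.TemporaryDirectory() as work_dir:
    save_checkpoint(work_dir, 'exp123', {'graphs': {}}, [{'label': 'm1'}], {'name': 'Random'})
    model_ir, mutators, strategy = load_checkpoint(work_dir, 'exp123')
    assert strategy == {'name': 'Random'}
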
diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index 28ea8edc00..f60693acd3 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -1,4 +1,3 @@ -from asyncio import subprocess import os import subprocess import sys From a6cb74d763a570af50c7a969ebbfbca215bf1816 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 9 Aug 2022 19:16:58 +0800 Subject: [PATCH 63/77] fix pylint --- nni/experiment/config/utils/internal.py | 2 +- nni/nas/experiment/pytorch.py | 33 +++++++++++-------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 21c4f4b8b0..8e1e30b160 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -218,7 +218,7 @@ def load_experiment_config(config_json: dict) -> ExperimentConfig | RetiariiExeC else: raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') -def get_experiment_cls_using_config(config_json: dict) -> Experiment | RetiariiExperiment: +def get_experiment_cls_using_config(config_json: dict) -> type[Experiment] | type[RetiariiExperiment]: from nni.nas.experiment.pytorch import RetiariiExperiment from ...experiment import Experiment if 'experimentType' in config_json: diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index c31ee1e51a..c85102c405 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -39,7 +39,7 @@ ) if TYPE_CHECKING: - from pathlib import Path + from nni.experiment.config.utils import PathLike _logger = logging.getLogger(__name__) @@ -195,9 +195,9 @@ def __init__(self, base_model: nn.Module = cast(nn.Module, None), # check for sanity if not is_model_wrapped(base_model): warnings.warn(colorama.Style.BRIGHT + colorama.Fore.RED + - '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' - 'but it may cause inconsistent behavior compared to the time when you add it.' + colorama.Style.RESET_ALL, - RuntimeWarning) + '`@model_wrapper` is missing for the base model. The experiment might still be able to run, ' + 'but it may cause inconsistent behavior compared to the time when you add it.' 
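
For reference, the call pattern the new entry points are designed for, mirroring `test_multitrial_experiment_resume_view` above. This is a usage sketch only: it assumes an NNI installation that already contains these changes and a previously stopped NAS experiment whose id you substitute for the placeholder, so it is not runnable as-is.

from nni.nas.experiment.pytorch import RetiariiExperiment

exp_id = 'REPLACE_WITH_A_STOPPED_EXPERIMENT_ID'  # placeholder, not a real id

# resume() reloads the checkpoint, reruns the strategy in the foreground,
# and returns the experiment object for follow-up calls.
exp = RetiariiExperiment.resume(exp_id, port=8080)
exp.stop()

# view() only restarts the web UI; non_blocking=True hands control back immediately.
exp = RetiariiExperiment.view(exp_id, port=8081, non_blocking=True)
exp.stop()
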
+ colorama.Style.RESET_ALL, + RuntimeWarning) self.base_model = base_model self.evaluator: Evaluator = evaluator @@ -219,11 +219,8 @@ def _create_execution_engine(self, config: RetiariiExeConfig) -> None: engine = init_execution_engine(config, self.port, self.url_prefix) set_execution_engine(engine) - def _save_experiment_checkpoint(self, - base_model_ir: Model, - applied_mutators: List[Mutator], - strategy: BaseStrategy, - exp_work_dir: Path) -> None: + def _save_experiment_checkpoint(self, base_model_ir: Model, applied_mutators: List[Mutator], + strategy: BaseStrategy, exp_work_dir: PathLike) -> None: ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'w') as fp: dump(base_model_ir._dump(), fp, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024))) @@ -232,7 +229,7 @@ def _save_experiment_checkpoint(self, with open(os.path.join(ckp_path, 'strategy'), 'w') as fp: dump(strategy, fp) - def _load_experiment_checkpoint(self, exp_work_dir: Path) -> Tuple[Model, List[Mutator], BaseStrategy]: + def _load_experiment_checkpoint(self, exp_work_dir: PathLike) -> Tuple[Model, List[Mutator], BaseStrategy]: ckp_path = os.path.join(exp_work_dir, self.id, 'checkpoint') with open(os.path.join(ckp_path, 'nas_model'), 'r') as fp: base_model_ir = load(fp=fp) @@ -284,26 +281,26 @@ def run(self, self.strategy.run(base_model_ir, self.applied_mutators) else: ws_url = f'ws://localhost:{port}/tuner' - canonicalized_config = self._start_impl(port, debug, RunMode.Background, ws_url, ['retiarii']) - canonicalized_config = cast(RetiariiExeConfig, canonicalized_config) + canoni_conf = self._start_impl(port, debug, RunMode.Background, ws_url, ['retiarii']) + canoni_conf = cast(RetiariiExeConfig, canoni_conf) self._dispatcher = RetiariiAdvisor(ws_url) self._dispatcher_thread = Thread(target=self._dispatcher.run, daemon=True) self._dispatcher_thread.start() # FIXME: engine cannot be created twice - self._create_execution_engine(canonicalized_config) + self._create_execution_engine(canoni_conf) try: if self._action == 'create': base_model_ir, self.applied_mutators = preprocess_model( self.base_model, self.evaluator, self.applied_mutators, - full_ir=not isinstance(canonicalized_config.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), - dummy_input=canonicalized_config.execution_engine.dummy_input - if isinstance(canonicalized_config.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None + full_ir=not isinstance(canoni_conf.execution_engine, (PyEngineConfig, BenchmarkEngineConfig)), + dummy_input=canoni_conf.execution_engine.dummy_input + if isinstance(canoni_conf.execution_engine, (BaseEngineConfig, CgoEngineConfig)) else None ) self._save_experiment_checkpoint(base_model_ir, self.applied_mutators, self.strategy, - canonicalized_config.experiment_working_directory) + canoni_conf.experiment_working_directory) elif self._action == 'resume': base_model_ir, self.applied_mutators, self.strategy = self._load_experiment_checkpoint( - canonicalized_config.experiment_working_directory) + canoni_conf.experiment_working_directory) else: raise RuntimeError(f'The experiment mode "{self._action}" is not supposed to invoke run() method.') From e6f0fea4ffee35b325ea2b96a52d207050d42d49 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 10 Aug 2022 11:17:29 +0800 Subject: [PATCH 64/77] resolve comments --- nni/experiment/config/utils/internal.py | 42 ++++++++++++------------- nni/nas/experiment/pytorch.py | 9 +++--- nni/tools/nnictl/launcher.py | 23 
+++++++------- test/ut/nas/test_experiment.py | 4 ++- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/nni/experiment/config/utils/internal.py b/nni/experiment/config/utils/internal.py index 8e1e30b160..88c7ee3c0d 100644 --- a/nni/experiment/config/utils/internal.py +++ b/nni/experiment/config/utils/internal.py @@ -26,7 +26,7 @@ import os.path from pathlib import Path import socket -import typing +from typing import Tuple, TYPE_CHECKING, get_type_hints import typeguard @@ -34,7 +34,7 @@ from .public import is_missing -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from nni.nas.experiment.pytorch import RetiariiExperiment from nni.nas.experiment.config import RetiariiExeConfig from ...experiment import Experiment @@ -83,7 +83,7 @@ def fields(config: ConfigBase) -> list[dataclasses.Field]: # Similar to `dataclasses.fields()`, but use `typing.get_types_hints()` to get `field.type`. # This is useful when postponed evaluation is enabled. ret = [copy.copy(field) for field in dataclasses.fields(config)] - types = typing.get_type_hints(type(config)) + types = get_type_hints(type(config)) for field in ret: field.type = types[field.name] return ret @@ -205,31 +205,29 @@ def get_ipv4_address() -> str: return addr def load_experiment_config(config_json: dict) -> ExperimentConfig | RetiariiExeConfig: - # avoid circular import - from nni.nas.experiment.config import RetiariiExeConfig - from nni.nas.experiment.pytorch import RetiariiExperiment - from ..experiment_config import ExperimentConfig - from ...experiment import Experiment - exp_cls = get_experiment_cls_using_config(config_json) - if exp_cls is Experiment: - return ExperimentConfig(**config_json) - elif exp_cls is RetiariiExperiment: - return RetiariiExeConfig(**config_json) - else: - raise TypeError(f'Unsupported experiment type: {type(exp_cls)}') + _, exp_conf_cls = get_experiment_cls_using_config(config_json) + return exp_conf_cls(**config_json) -def get_experiment_cls_using_config(config_json: dict) -> type[Experiment] | type[RetiariiExperiment]: - from nni.nas.experiment.pytorch import RetiariiExperiment - from ...experiment import Experiment +def get_experiment_cls_using_config(config_json: dict) -> Tuple[type[Experiment] | type[RetiariiExperiment], + type[ExperimentConfig] | type[RetiariiExeConfig]]: + # avoid circular import and unnecessary dependency on pytorch if 'experimentType' in config_json: if config_json['experimentType'] == 'hpo': - return Experiment + from ...experiment import Experiment + from ..experiment_config import ExperimentConfig + return Experiment, ExperimentConfig elif config_json['experimentType'] == 'nas': - return RetiariiExperiment + from nni.nas.experiment.pytorch import RetiariiExperiment + from nni.nas.experiment.config import RetiariiExeConfig + return RetiariiExperiment, RetiariiExeConfig else: raise ValueError(f'Unknown experiment_type: {config_json["experimentType"]}') else: if 'executionEngine' in config_json: - return RetiariiExperiment + from nni.nas.experiment.pytorch import RetiariiExperiment + from nni.nas.experiment.config import RetiariiExeConfig + return RetiariiExperiment, RetiariiExeConfig else: - return Experiment + from ...experiment import Experiment + from ..experiment_config import ExperimentConfig + return Experiment, ExperimentConfig diff --git a/nni/nas/experiment/pytorch.py b/nni/nas/experiment/pytorch.py index c85102c405..60c5939176 100644 --- a/nni/nas/experiment/pytorch.py +++ b/nni/nas/experiment/pytorch.py @@ -258,11 +258,10 @@ def run(self, """ from 
nni.retiarii.oneshot.interface import BaseOneShotTrainer if isinstance(self.evaluator, BaseOneShotTrainer): - # TODO: will throw a deprecation warning soon - # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' - # 'We will try to convert this trainer to our new implementation to run the algorithm. ' - # 'In case you want to stick to the old implementation, ' - # 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) + warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. ' + 'We will try to convert this trainer to our new implementation to run the algorithm. ' + 'In case you want to stick to the old implementation, ' + 'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning) self.evaluator.fit() return diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index c9f4860672..72219669fa 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -11,7 +11,6 @@ from nni.experiment import Experiment, RunMode from nni.experiment.config import ExperimentConfig, convert, utils -from nni.nas.experiment.pytorch import RetiariiExperiment from nni.tools.annotation import expand_annotations, generate_search_space # used for v1-only legacy setup, remove them later @@ -105,13 +104,14 @@ def resume_experiment(args): legacy_launcher.resume_experiment(args) exit() - exp_class = utils.get_experiment_cls_using_config(config_json) - if exp_class is RetiariiExperiment: - RetiariiExperiment.resume(exp_id, port, debug) - else: - exp = Experiment._resume(exp_id, exp_dir) + exp_cls, _ = utils.get_experiment_cls_using_config(config_json) + if exp_cls is Experiment: + exp = exp_cls._resume(exp_id, exp_dir) run_mode = RunMode.Foreground if foreground else RunMode.Detach exp.start(port, debug, run_mode) + else: + # exp_cls is RetiariiExperiment + exp_cls.resume(exp_id, port, debug) def view_experiment(args): exp_id = args.id @@ -123,9 +123,10 @@ def view_experiment(args): legacy_launcher.view_experiment(args) exit() - exp_class = utils.get_experiment_cls_using_config(config_json) - if exp_class is RetiariiExperiment: - RetiariiExperiment.view(exp_id, port, non_blocking=True) - else: - exp = Experiment._view(exp_id, exp_dir) + exp_cls, _ = utils.get_experiment_cls_using_config(config_json) + if exp_cls is Experiment: + exp = exp_cls._view(exp_id, exp_dir) exp.start(port, run_mode=RunMode.Detach) + else: + # exp_cls is RetiariiExperiment + exp_cls.view(exp_id, port, non_blocking=True) diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index f60693acd3..8640d58c17 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -113,6 +113,7 @@ def test_multitrial_experiment(pytestconfig): assert isinstance(exp.export_top_models()[0], dict) exp.stop() + def test_multitrial_experiment_resume_view(pytestconfig): # start a normal nas experiment base_model = Net() @@ -145,7 +146,7 @@ def test_multitrial_experiment_resume_view(pytestconfig): # the following is nnictl resume and view print('nnictl resume...') new_env = os.environ.copy() - new_env['PYTHONPATH'] = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)) + new_env['PYTHONPATH'] = pytestconfig.rootpath proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode print('nnictl view...') @@ -154,6 +155,7 @@ def 
test_multitrial_experiment_resume_view(pytestconfig): proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode + def test_oneshot_experiment(): base_model = Net() evaluator = get_mnist_evaluator() From f4faa739c006cc5c9115dd01e0151e9f3da470f6 Mon Sep 17 00:00:00 2001 From: quzha Date: Wed, 10 Aug 2022 12:48:15 +0800 Subject: [PATCH 65/77] move test --- test/algo/nas/test_multitrial.py | 43 +++++++++++++++++++++++++++++++- test/ut/nas/test_experiment.py | 43 -------------------------------- 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/test/algo/nas/test_multitrial.py b/test/algo/nas/test_multitrial.py index cb7d395aea..14ccd4bf39 100644 --- a/test/algo/nas/test_multitrial.py +++ b/test/algo/nas/test_multitrial.py @@ -1,6 +1,6 @@ import multiprocessing import os -import sys +import subprocess import time import pytest @@ -76,3 +76,44 @@ def test_exp_exit_without_stop(pytestconfig): return process.kill() raise RuntimeError(f'Experiment fails to stop in {timeout} seconds.') + + +def test_multitrial_experiment_resume_view(pytestconfig): + # start a normal nas experiment + base_model, evaluator = _mnist_net('simple', {'max_epochs': 1}) + search_strategy = strategy.Random() + exp = RetiariiExperiment(base_model, evaluator, strategy=search_strategy) + exp_id = exp.id + exp_config = RetiariiExeConfig('local') + exp_config.trial_concurrency = 1 + exp_config.max_trial_number = 1 + exp_config._trial_command_params = nas_experiment_trial_params(pytestconfig.rootpath) + exp.run(exp_config) + ensure_success(exp) + assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + + # resume the above nas experiment. only tested the resume logic in the python side, + # as no more trial is executed after resume, the above experiment is already finished + print('python api resume...') + exp = RetiariiExperiment.resume(exp_id) + ensure_success(exp) + # TODO: currently `export_top_models` does not work as strategy's states are not resumed + # assert isinstance(exp.export_top_models()[0], dict) + exp.stop() + # view the above experiment in non blocking mode then stop it + print('python api view...') + exp = RetiariiExperiment.view(exp_id, non_blocking=True) + exp.stop() + + # the following is nnictl resume and view + print('nnictl resume...') + new_env = os.environ.copy() + new_env['PYTHONPATH'] = str(pytestconfig.rootpath) + proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) + assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode + print('nnictl view...') + proc = subprocess.run(f'nnictl view {exp_id}', shell=True) + assert proc.returncode == 0, 'view nas experiment failed with code %d' % proc.returncode + proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) + assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode \ No newline at end of file diff --git a/test/ut/nas/test_experiment.py b/test/ut/nas/test_experiment.py index 8640d58c17..991e3b23f6 100644 --- a/test/ut/nas/test_experiment.py +++ b/test/ut/nas/test_experiment.py @@ -1,5 +1,4 @@ import os -import subprocess import sys import nni @@ -114,48 +113,6 @@ def test_multitrial_experiment(pytestconfig): exp.stop() -def test_multitrial_experiment_resume_view(pytestconfig): - # start a normal nas experiment - base_model = Net() - evaluator = get_mnist_evaluator() - search_strategy = strategy.Random() - exp = RetiariiExperiment(base_model, 
evaluator, strategy=search_strategy) - exp_id = exp.id - exp_config = RetiariiExeConfig('local') - exp_config.trial_concurrency = 1 - exp_config.max_trial_number = 1 - exp_config._trial_command_params = nas_experiment_trial_params(pytestconfig.rootpath) - exp.run(exp_config) - ensure_success(exp) - assert isinstance(exp.export_top_models()[0], dict) - exp.stop() - - # resume the above nas experiment. only tested the resume logic in the python side, - # as no more trial is executed after resume, the above experiment is already finished - print('python api resume...') - exp = RetiariiExperiment.resume(exp_id) - ensure_success(exp) - # TODO: currently `export_top_models` does not work as strategy's states are not resumed - # assert isinstance(exp.export_top_models()[0], dict) - exp.stop() - # view the above experiment in non blocking mode then stop it - print('python api view...') - exp = RetiariiExperiment.view(exp_id, non_blocking=True) - exp.stop() - - # the following is nnictl resume and view - print('nnictl resume...') - new_env = os.environ.copy() - new_env['PYTHONPATH'] = pytestconfig.rootpath - proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) - assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode - print('nnictl view...') - proc = subprocess.run(f'nnictl view {exp_id}', shell=True) - assert proc.returncode == 0, 'view nas experiment failed with code %d' % proc.returncode - proc = subprocess.run(f'nnictl stop {exp_id}', shell=True) - assert proc.returncode == 0, 'stop viewed nas experiment failed with code %d' % proc.returncode - - def test_oneshot_experiment(): base_model = Net() evaluator = get_mnist_evaluator() From eab49a62a66ffba8fee0b95afd80812102c39152 Mon Sep 17 00:00:00 2001 From: quzha Date: Fri, 26 Aug 2022 20:00:50 +0800 Subject: [PATCH 66/77] fix bug and add doc --- README.md | 3 ++- docs/source/notes/research_publications.rst | 26 +++++++++++++++++++++ nni/nas/execution/common/integration.py | 3 +++ test/algo/nas/test_multitrial.py | 7 ++++++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c9eae6fce..e3bf4eb92f 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,8 @@ NNI automates feature engineering, neural architecture search, hyperparameter tu * **New release**: [v2.8 is available](https://github.com/microsoft/nni/releases/tag/v2.8) - _released on June-22-2022_ * **New demo available**: [Youtube entry](https://www.youtube.com/channel/UCKcafm6861B2mnYhPbZHavw) | [Bilibili 入口](https://space.bilibili.com/1649051673) - _last updated on June-22-2022_ -* **New webinar**: [Introducing Retiarii: A deep learning exploratory-training framework on NNI](https://note.microsoft.com/MSR-Webinar-Retiarii-Registration-Live.html) - _scheduled on June-24-2021_ +* **New research paper**: [SparTA: Deep-Learning Model Sparsity via Tensor-with-Sparsity-Attribute](https://www.usenix.org/system/files/osdi22-zheng-ningxin.pdf) - _published in OSDI 2022_ +* **New research paper**: [Privacy-preserving Online AutoML for Domain-Specific Face Detection](https://openaccess.thecvf.com/content/CVPR2022/papers/Yan_Privacy-Preserving_Online_AutoML_for_Domain-Specific_Face_Detection_CVPR_2022_paper.pdf) - _published in CVPR 2022_ * **Newly upgraded documentation**: [Doc upgraded](https://nni.readthedocs.io/en/stable) diff --git a/docs/source/notes/research_publications.rst b/docs/source/notes/research_publications.rst index 1ca12930e0..9ab0d6eaaa 100644 --- 
a/docs/source/notes/research_publications.rst +++ b/docs/source/notes/research_publications.rst @@ -7,6 +7,19 @@ System Research --------------- +* `SparTA: Deep-Learning Model Sparsity via Tensor-with-Sparsity-Attribute `__ + +.. code-block:: bibtex + + @inproceedings{zheng2022sparta, + title={$\{$SparTA$\}$:$\{$Deep-Learning$\}$ Model Sparsity via $\{$Tensor-with-Sparsity-Attribute$\}$}, + author={Zheng, Ningxin and Lin, Bin and Zhang, Quanlu and Ma, Lingxiao and Yang, Yuqing and Yang, Fan and Wang, Yang and Yang, Mao and Zhou, Lidong}, + booktitle={16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}, + pages={213--232}, + year={2022} + } + + * `Retiarii: A Deep Learning Exploratory-Training Framework `__ .. code-block:: bibtex @@ -52,6 +65,19 @@ New Algorithms ^^^^^^^^^^^^^^ +* `Privacy-preserving Online AutoML for Domain-Specific Face Detection `__ + +.. code-block:: bibtex + + @inproceedings{yan2022privacy, + title={Privacy-preserving Online AutoML for Domain-Specific Face Detection}, + author={Yan, Chenqian and Zhang, Yuge and Zhang, Quanlu and Yang, Yaming and Jiang, Xinyang and Yang, Yuqing and Wang, Baoyuan}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4134--4144}, + year={2022} + } + + * `TextNAS: A Neural Architecture Search Space Tailored for Text Representation `__ .. code-block:: bibtex diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index ce04b79f5a..3deb319ea8 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -237,3 +237,6 @@ def _process_value(value) -> Any: # hopefully a float def handle_import_data(self, data): # FIXME: ignore imported data for now, as strategy has not supported resume pass + + def handle_add_customized_trial(self, data): + pass diff --git a/test/algo/nas/test_multitrial.py b/test/algo/nas/test_multitrial.py index 14ccd4bf39..81de742d2d 100644 --- a/test/algo/nas/test_multitrial.py +++ b/test/algo/nas/test_multitrial.py @@ -98,18 +98,25 @@ def test_multitrial_experiment_resume_view(pytestconfig): print('python api resume...') exp = RetiariiExperiment.resume(exp_id) ensure_success(exp) + # sleep here because there would be several seconds for the experiment status to change + # to ERROR from INITIALIZED/RUNNING if the resume gets error. 
+ time.sleep(5) + assert exp.get_status() == 'DONE', f'The experiment status should not be {exp.get_status()}' # TODO: currently `export_top_models` does not work as strategy's states are not resumed # assert isinstance(exp.export_top_models()[0], dict) exp.stop() # view the above experiment in non blocking mode then stop it print('python api view...') exp = RetiariiExperiment.view(exp_id, non_blocking=True) + assert exp.get_status() == 'VIEWED', f'The experiment status should not be {exp.get_status()}' exp.stop() # the following is nnictl resume and view print('nnictl resume...') new_env = os.environ.copy() new_env['PYTHONPATH'] = str(pytestconfig.rootpath) + # NOTE: experiment status (e.g., ERROR) is not checked, because it runs in blocking mode and + # the rest server exits right after the command is done proc = subprocess.run(f'nnictl resume {exp_id}', shell=True, env=new_env) assert proc.returncode == 0, 'resume nas experiment failed with code %d' % proc.returncode print('nnictl view...') From bccb126beb025ffd53d2d90ea884bb33cb924678 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 29 Aug 2022 13:48:21 +0800 Subject: [PATCH 67/77] resolve comment --- test/algo/nas/test_multitrial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/algo/nas/test_multitrial.py b/test/algo/nas/test_multitrial.py index 81de742d2d..0257d6ba02 100644 --- a/test/algo/nas/test_multitrial.py +++ b/test/algo/nas/test_multitrial.py @@ -100,7 +100,7 @@ def test_multitrial_experiment_resume_view(pytestconfig): ensure_success(exp) # sleep here because there would be several seconds for the experiment status to change # to ERROR from INITIALIZED/RUNNING if the resume gets error. - time.sleep(5) + time.sleep(6) assert exp.get_status() == 'DONE', f'The experiment status should not be {exp.get_status()}' # TODO: currently `export_top_models` does not work as strategy's states are not resumed # assert isinstance(exp.export_top_models()[0], dict) From 4d1778318773629dc68ee18cc3bc1a48f4d98b67 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 14:42:02 +0800 Subject: [PATCH 68/77] fix issue --- nni/nas/execution/common/integration.py | 31 ++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 3deb319ea8..42f41c451d 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -60,7 +60,8 @@ def __init__(self, url: str): self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None self.parameters_count = 0 - + # for dealing with the resumed running trials of the before-resumed experiment + self.previous_max_param_id = 0 # Sometimes messages arrive first before the callbacks get registered. # Or in case that we allow engine to be absent during the experiment. # Here we need to store the messages and invoke them later. 
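The hunk that follows is the core of the resume fix: events belonging to trials created before the interruption must not be re-dispatched to the strategy. Reduced to a self-contained sketch (class and method names here are illustrative; patches 69/77 and 70/77 later move the real bookkeeping into nni.recoverable.Recoverable as recover_parameter_id / is_created_in_previous_exp), the pattern is: remember the highest parameter id replayed from the old run, then drop trial-end and metric events at or below that watermark.

class ResumeWatermark:
    # illustrative stand-in for the previous_max_param_id bookkeeping added below
    def __init__(self):
        self.previous_max_param_id = 0   # highest parameter id replayed from the interrupted run
        self.dispatched = []             # events that actually reach the strategy

    def on_recovered_trials(self, trials):
        # trials replayed by nnimanager on resume, e.g.
        # [{'parameter_id': 3, 'parameter_source': 'resumed', 'parameters': {...}}]
        for trial in trials:
            pid = trial.get('parameter_id')
            if isinstance(pid, int) and pid > self.previous_max_param_id:
                self.previous_max_param_id = pid

    def on_trial_end(self, parameter_id, succeeded):
        if parameter_id <= self.previous_max_param_id:
            return                       # belongs to the pre-resume run: ignore, do not re-dispatch
        self.dispatched.append((parameter_id, succeeded))

w = ResumeWatermark()
w.on_recovered_trials([{'parameter_id': 3, 'parameters': {}}])
w.on_trial_end(2, True)   # recovered trial, ignored
w.on_trial_end(4, True)   # new trial, handled
assert w.dispatched == [(4, True)]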
@@ -212,10 +213,22 @@ def handle_update_search_space(self, data): self.search_space = data def handle_trial_end(self, data): + # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring + id_ = nni.load(data['hyper_params'])['parameter_id'] + if id_ <= self.previous_max_param_id: + _logger.info('The end of the recovered trial %d is ignored', id_) + return _logger.debug('Trial end: %s', data) - self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED') + self.invoke_callback('trial_end', id_, data['event'] == 'SUCCEEDED') def handle_report_metric_data(self, data): + # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring + if data['parameter_id'] <= self.previous_max_param_id: + _logger.info('The metrics of the recovered trial %d are ignored', data['parameter_id']) + return + # NOTE: this part is not aligned with hpo tuners. + # in hpo tuners, trial_job_id is used for intermediate results handling + # parameter_id is for final result handling. _logger.debug('Metric reported: %s', data) if data['type'] == MetricType.REQUEST_PARAMETER: raise ValueError('Request parameter not supported') @@ -239,4 +252,16 @@ def handle_import_data(self, data): pass def handle_add_customized_trial(self, data): - pass + # this is for handling the resuming of the interrupted data: parameters + if not isinstance(data, list): + data = [data] + + for trial in data: + # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} + if isinstance(trial, str): + trial = nni.load(trial) + if self.previous_max_param_id < trial['parameter_id']: + self.previous_max_param_id = trial['parameter_id'] + self.parameters_count = self.previous_max_param_id + + # TODO: handle customized trials From 097a7817213c5f42b918054aa0de9bd379f37f35 Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 21:24:37 +0800 Subject: [PATCH 69/77] fix --- .../hpo/bohb_advisor/bohb_advisor.py | 15 +++++++++- nni/algorithms/hpo/hyperband_advisor.py | 11 ++++++- nni/algorithms/hpo/tpe_tuner.py | 13 -------- nni/nas/execution/common/integration.py | 28 +++++++---------- nni/nas/execution/common/integration_api.py | 10 ++++++- nni/recoverable.py | 30 +++++++++++++++++++ nni/runtime/msg_dispatcher.py | 21 ++++++++----- nni/runtime/msg_dispatcher_base.py | 1 + nni/tuner.py | 8 ----- 9 files changed, 88 insertions(+), 49 deletions(-) diff --git a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py index 9127ddaddc..56adad67f2 100644 --- a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py +++ b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py @@ -648,6 +648,9 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ + if self.is_created_in_previous_exp(data['parameter_id']): + # The end of the recovered trial is ignored + return logger.debug('Tuner handle trial end, result is %s', data) hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) @@ -695,6 +698,13 @@ def handle_report_metric_data(self, data): ValueError Data type not supported """ + if self.is_created_in_previous_exp(data['parameter_id']): + if data['type'] == MetricType.FINAL: + # only deal with final metric using import data + param = self.get_previous_param(data['parameter_id']) + trial_data = [{'parameter': param, 'value': nni.load(data['value'])}] + 
self.handle_import_data(trial_data) + return logger.debug('handle report metric data = %s', data) if 'value' in data: data['value'] = nni.load(data['value']) @@ -752,7 +762,10 @@ def handle_report_metric_data(self, data): 'Data type not supported: {}'.format(data['type'])) def handle_add_customized_trial(self, data): - pass + global _next_parameter_id + # data: parameters + previous_max_param_id = self.recover_parameter_id(data) + _next_parameter_id = previous_max_param_id + 1 def handle_import_data(self, data): """Import additional data for tuning diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index 899696137d..f60273fc27 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -521,6 +521,9 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ + if self.is_created_in_previous_exp(data['parameter_id']): + # The end of the recovered trial is ignored + return hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) if data['trial_job_id'] in self.job_id_para_id_map: @@ -538,6 +541,9 @@ def handle_report_metric_data(self, data): ValueError Data type not supported """ + if self.is_created_in_previous_exp(data['parameter_id']): + # do not support recovering the algorithm state + return if 'value' in data: data['value'] = nni.load(data['value']) # multiphase? need to check @@ -576,7 +582,10 @@ def handle_report_metric_data(self, data): raise ValueError('Data type not supported: {}'.format(data['type'])) def handle_add_customized_trial(self, data): - pass + global _next_parameter_id + # data: parameters + previous_max_param_id = self.recover_parameter_id(data) + _next_parameter_id = previous_max_param_id + 1 def handle_import_data(self, data): pass diff --git a/nni/algorithms/hpo/tpe_tuner.py b/nni/algorithms/hpo/tpe_tuner.py index 463c18f7d9..b9df04c3a4 100644 --- a/nni/algorithms/hpo/tpe_tuner.py +++ b/nni/algorithms/hpo/tpe_tuner.py @@ -218,19 +218,6 @@ def import_data(self, data): # for resuming experiment self.dedup.add_history(param) _logger.info(f'Replayed {len(data)} FINISHED trials') - def import_customized_data(self, data): # for dedup customized / resumed - if isinstance(data, str): - data = nni.load(data) - - for trial in data: - # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} - if isinstance(trial, str): - trial = nni.load(trial) - param = format_parameters(trial['parameters'], self.space) - self._running_params[trial['parameter_id']] = param - self.dedup.add_history(param) - _logger.info(f'Replayed {len(data)} RUNING/WAITING trials') - def suggest(args, rng, space, history): params = {} for key, spec in space.items(): diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 42f41c451d..7f34021ad9 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -60,12 +60,16 @@ def __init__(self, url: str): self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None self.parameters_count = 0 - # for dealing with the resumed running trials of the before-resumed experiment - self.previous_max_param_id = 0 # Sometimes messages arrive first before the callbacks get registered. # Or in case that we allow engine to be absent during the experiment. # Here we need to store the messages and invoke them later. 
self.call_queue: List[Tuple[str, list]] = [] + # this is for waiting the to-be-recovered trials from nnimanager + self._advisor_initialized = False + + @property + def initialized(self): + return self._advisor_initialized def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]): """ @@ -215,7 +219,7 @@ def handle_update_search_space(self, data): def handle_trial_end(self, data): # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring id_ = nni.load(data['hyper_params'])['parameter_id'] - if id_ <= self.previous_max_param_id: + if self.is_created_in_previous_exp(id_): _logger.info('The end of the recovered trial %d is ignored', id_) return _logger.debug('Trial end: %s', data) @@ -223,7 +227,7 @@ def handle_trial_end(self, data): def handle_report_metric_data(self, data): # TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring - if data['parameter_id'] <= self.previous_max_param_id: + if self.is_created_in_previous_exp(data['parameter_id']): _logger.info('The metrics of the recovered trial %d are ignored', data['parameter_id']) return # NOTE: this part is not aligned with hpo tuners. @@ -252,16 +256,6 @@ def handle_import_data(self, data): pass def handle_add_customized_trial(self, data): - # this is for handling the resuming of the interrupted data: parameters - if not isinstance(data, list): - data = [data] - - for trial in data: - # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} - if isinstance(trial, str): - trial = nni.load(trial) - if self.previous_max_param_id < trial['parameter_id']: - self.previous_max_param_id = trial['parameter_id'] - self.parameters_count = self.previous_max_param_id - - # TODO: handle customized trials + previous_max_param_id = self.recover_parameter_id(data) + self.parameters_count = previous_max_param_id + self._advisor_initialized = True diff --git a/nni/nas/execution/common/integration_api.py b/nni/nas/execution/common/integration_api.py index f7f08adb31..37c381bf6b 100644 --- a/nni/nas/execution/common/integration_api.py +++ b/nni/nas/execution/common/integration_api.py @@ -6,12 +6,16 @@ '_advisor' # FIXME: hack to make it importable for tests ] +import logging +import time import warnings from typing import NewType, Any import nni from nni.common.version import version_check +_logger = logging.getLogger(__name__) + # NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor # because it would induce cycled import RetiariiAdvisor = NewType('RetiariiAdvisor', Any) @@ -41,7 +45,11 @@ def send_trial(parameters: dict, placement_constraint=None) -> int: Send a new trial. Executed on tuner end. Return a ID that is the unique identifier for this trial. 
""" - return get_advisor().send_trial(parameters, placement_constraint) + advisor = get_advisor() + while not advisor.initialized: + _logger.info('Wait for RetiariiAdvisor to be initialized...') + time.sleep(0.5) + return advisor.send_trial(parameters, placement_constraint) def receive_trial_parameters() -> dict: """ diff --git a/nni/recoverable.py b/nni/recoverable.py index 4ff419b8f9..2992e0e172 100644 --- a/nni/recoverable.py +++ b/nni/recoverable.py @@ -4,8 +4,12 @@ from __future__ import annotations import os +import nni class Recoverable: + def __init__(self): + self.recovered_max_param_id = -1 + self.recovered_trial_params = {} def load_checkpoint(self) -> None: pass @@ -18,3 +22,29 @@ def get_checkpoint_path(self) -> str | None: if ckp_path is not None and os.path.isdir(ckp_path): return ckp_path return None + + def recover_parameter_id(self, data) -> int: + # this is for handling the resuming of the interrupted data: parameters + if not isinstance(data, list): + data = [data] + + previous_max_param_id = 0 + for trial in data: + # {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...} + if isinstance(trial, str): + trial = nni.load(trial) + if not isinstance(trial['parameter_id'], int): + # for dealing with user customized trials + # skip for now + continue + self.recovered_trial_params[trial['parameter_id']] = trial['parameters'] + if previous_max_param_id < trial['parameter_id']: + previous_max_param_id = trial['parameter_id'] + self.recovered_max_param_id = previous_max_param_id + return previous_max_param_id + + def is_created_in_previous_exp(self, param_id: int) -> bool: + return param_id <= self.recovered_max_param_id + + def get_previous_param(self, param_id: int) -> dict: + return self.recovered_trial_params[param_id] \ No newline at end of file diff --git a/nni/runtime/msg_dispatcher.py b/nni/runtime/msg_dispatcher.py index b337d879db..dd62c1e9a1 100644 --- a/nni/runtime/msg_dispatcher.py +++ b/nni/runtime/msg_dispatcher.py @@ -120,15 +120,10 @@ def handle_import_data(self, data): self.tuner.import_data(data) def handle_add_customized_trial(self, data): + global _next_parameter_id # data: parameters - if not isinstance(data, list): - data = [data] - - for _ in data: - id_ = _create_parameter_id() - _customized_parameter_ids.add(id_) - - self.tuner.import_customized_data(data) + previous_max_param_id = self.recover_parameter_id(data) + _next_parameter_id = previous_max_param_id + 1 def handle_report_metric_data(self, data): """ @@ -137,6 +132,13 @@ def handle_report_metric_data(self, data): - 'value': metric value reported by nni.report_final_result() - 'type': report type, support {'FINAL', 'PERIODICAL'} """ + if self.is_created_in_previous_exp(data['parameter_id']): + if data['type'] == MetricType.FINAL: + # only deal with final metric using import data + param = self.get_previous_param(data['parameter_id']) + trial_data = [{'parameter': param, 'value': load(data['value'])}] + self.handle_import_data(trial_data) + return # metrics value is dumped as json string in trial, so we need to decode it here if 'value' in data: data['value'] = load(data['value']) @@ -166,6 +168,9 @@ def handle_trial_end(self, data): - event: the job's state - hyper_params: the hyperparameters generated and returned by tuner """ + if self.is_created_in_previous_exp(data['parameter_id']): + # The end of the recovered trial is ignored + return trial_job_id = data['trial_job_id'] _ended_trials.add(trial_job_id) if trial_job_id in _trial_history: diff --git 
a/nni/runtime/msg_dispatcher_base.py b/nni/runtime/msg_dispatcher_base.py index 99e6c71c91..ac6a3b37e3 100644 --- a/nni/runtime/msg_dispatcher_base.py +++ b/nni/runtime/msg_dispatcher_base.py @@ -30,6 +30,7 @@ class MsgDispatcherBase(Recoverable): """ def __init__(self, command_channel_url=None): + super().__init__() self.stopping = False if command_channel_url is None: command_channel_url = dispatcher_env_vars.NNI_TUNER_COMMAND_CHANNEL diff --git a/nni/tuner.py b/nni/tuner.py index c94e68043a..87b168db65 100644 --- a/nni/tuner.py +++ b/nni/tuner.py @@ -219,14 +219,6 @@ def import_data(self, data: list[TrialRecord]) -> None: # data: a list of dictionarys, each of which has at least two keys, 'parameter' and 'value' pass - def import_customized_data(self, data: list[TrialRecord]) -> None: - """ - Internal API under revising, not recommended for end users. - """ - # Import resume data for avoiding duplications - # data: a list of dictionarys, each of which has at least two keys, 'parameter_id' and 'parameters' - pass - def _on_exit(self) -> None: pass From ca4d86d994d730848213ea4eb95333eaa9e3b9bc Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 21:37:52 +0800 Subject: [PATCH 70/77] update --- nni/algorithms/hpo/bohb_advisor/bohb_advisor.py | 2 +- nni/algorithms/hpo/hyperband_advisor.py | 2 +- nni/runtime/msg_dispatcher.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py index 56adad67f2..4904247aa4 100644 --- a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py +++ b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py @@ -648,7 +648,7 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(data['parameter_id']): + if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): # The end of the recovered trial is ignored return logger.debug('Tuner handle trial end, result is %s', data) diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index f60273fc27..22766c7b2f 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -521,7 +521,7 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(data['parameter_id']): + if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): # The end of the recovered trial is ignored return hyper_params = nni.load(data['hyper_params']) diff --git a/nni/runtime/msg_dispatcher.py b/nni/runtime/msg_dispatcher.py index dd62c1e9a1..50c9188d11 100644 --- a/nni/runtime/msg_dispatcher.py +++ b/nni/runtime/msg_dispatcher.py @@ -168,7 +168,7 @@ def handle_trial_end(self, data): - event: the job's state - hyper_params: the hyperparameters generated and returned by tuner """ - if self.is_created_in_previous_exp(data['parameter_id']): + if self.is_created_in_previous_exp(load(data['hyper_params'])['parameter_id']): # The end of the recovered trial is ignored return trial_job_id = data['trial_job_id'] From eadfeb9d0da3fab7b0dc86d235c3278fff55b5ea Mon Sep 17 00:00:00 2001 From: quzha Date: Mon, 5 Sep 2022 21:41:24 +0800 Subject: [PATCH 71/77] minor --- nni/algorithms/hpo/bohb_advisor/bohb_advisor.py | 4 ++-- nni/algorithms/hpo/hyperband_advisor.py | 4 ++-- nni/runtime/msg_dispatcher.py | 
5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py index 4904247aa4..847c9e1a01 100644 --- a/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py +++ b/nni/algorithms/hpo/bohb_advisor/bohb_advisor.py @@ -648,11 +648,11 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): + hyper_params = nni.load(data['hyper_params']) + if self.is_created_in_previous_exp(hyper_params['parameter_id']): # The end of the recovered trial is ignored return logger.debug('Tuner handle trial end, result is %s', data) - hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) if data['trial_job_id'] in self.job_id_para_id_map: del self.job_id_para_id_map[data['trial_job_id']] diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index 22766c7b2f..d5c4db05be 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -521,10 +521,10 @@ def handle_trial_end(self, data): event: the job's state hyper_params: the hyperparameters (a string) generated and returned by tuner """ - if self.is_created_in_previous_exp(nni.load(data['hyper_params'])['parameter_id']): + hyper_params = nni.load(data['hyper_params']) + if self.is_created_in_previous_exp(hyper_params['parameter_id']): # The end of the recovered trial is ignored return - hyper_params = nni.load(data['hyper_params']) self._handle_trial_end(hyper_params['parameter_id']) if data['trial_job_id'] in self.job_id_para_id_map: del self.job_id_para_id_map[data['trial_job_id']] diff --git a/nni/runtime/msg_dispatcher.py b/nni/runtime/msg_dispatcher.py index 50c9188d11..42ba4c9de8 100644 --- a/nni/runtime/msg_dispatcher.py +++ b/nni/runtime/msg_dispatcher.py @@ -168,7 +168,8 @@ def handle_trial_end(self, data): - event: the job's state - hyper_params: the hyperparameters generated and returned by tuner """ - if self.is_created_in_previous_exp(load(data['hyper_params'])['parameter_id']): + id_ = load(data['hyper_params'])['parameter_id'] + if self.is_created_in_previous_exp(id_): # The end of the recovered trial is ignored return trial_job_id = data['trial_job_id'] @@ -178,7 +179,7 @@ def handle_trial_end(self, data): if self.assessor is not None: self.assessor.trial_end(trial_job_id, data['event'] == 'SUCCEEDED') if self.tuner is not None: - self.tuner.trial_end(load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED') + self.tuner.trial_end(id_, data['event'] == 'SUCCEEDED') def _handle_final_metric_data(self, data): """Call tuner to process final results From 2439a570476d70d8e4dfcb90deb17790d0cb6223 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 08:01:47 +0800 Subject: [PATCH 72/77] fix pylint --- nni/algorithms/hpo/hyperband_advisor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/algorithms/hpo/hyperband_advisor.py b/nni/algorithms/hpo/hyperband_advisor.py index d5c4db05be..cd53a24123 100644 --- a/nni/algorithms/hpo/hyperband_advisor.py +++ b/nni/algorithms/hpo/hyperband_advisor.py @@ -542,7 +542,7 @@ def handle_report_metric_data(self, data): Data type not supported """ if self.is_created_in_previous_exp(data['parameter_id']): - # do not support recovering the algorithm state + # do not support recovering the 
algorithm state return if 'value' in data: data['value'] = nni.load(data['value']) From c17256d4b8706e5d22526d1ed53f27a58815958d Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 08:38:48 +0800 Subject: [PATCH 73/77] fix bug --- nni/nas/execution/common/integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 7f34021ad9..2990237e09 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -209,6 +209,7 @@ def mark_experiment_as_ending(self): self.send(CommandType.NoMoreTrialJobs, '') def handle_request_trial_jobs(self, num_trials): + self._advisor_initialized = True _logger.debug('Request trial jobs: %s', num_trials) self.invoke_callback('request_trial_jobs', num_trials) @@ -258,4 +259,3 @@ def handle_import_data(self, data): def handle_add_customized_trial(self, data): previous_max_param_id = self.recover_parameter_id(data) self.parameters_count = previous_max_param_id - self._advisor_initialized = True From 7d905ec9dc0f05c09818e8b5d5ddc662cbda03b7 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 08:51:50 +0800 Subject: [PATCH 74/77] resolve comments --- nni/nas/execution/common/integration.py | 9 +++++---- nni/nas/execution/common/integration_api.py | 9 +-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/nni/nas/execution/common/integration.py b/nni/nas/execution/common/integration.py index 2990237e09..24b8f1f44f 100644 --- a/nni/nas/execution/common/integration.py +++ b/nni/nas/execution/common/integration.py @@ -4,6 +4,7 @@ __all__ = ['RetiariiAdvisor'] import logging +import time import os from typing import Any, Callable, Optional, Dict, List, Tuple @@ -67,10 +68,6 @@ def __init__(self, url: str): # this is for waiting the to-be-recovered trials from nnimanager self._advisor_initialized = False - @property - def initialized(self): - return self._advisor_initialized - def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]): """ Register callbacks for NNI backend. @@ -172,6 +169,10 @@ def send_trial(self, parameters, placement_constraint=None): Parameter ID that is assigned to this parameter, which will be used for identification in future. """ + while not self._advisor_initialized: + _logger.info('Wait for RetiariiAdvisor to be initialized...') + time.sleep(0.5) + self.parameters_count += 1 if placement_constraint is None: placement_constraint = { diff --git a/nni/nas/execution/common/integration_api.py b/nni/nas/execution/common/integration_api.py index 37c381bf6b..58e5e966a6 100644 --- a/nni/nas/execution/common/integration_api.py +++ b/nni/nas/execution/common/integration_api.py @@ -6,15 +6,12 @@ '_advisor' # FIXME: hack to make it importable for tests ] -import logging -import time import warnings from typing import NewType, Any import nni from nni.common.version import version_check -_logger = logging.getLogger(__name__) # NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor # because it would induce cycled import @@ -45,11 +42,7 @@ def send_trial(parameters: dict, placement_constraint=None) -> int: Send a new trial. Executed on tuner end. Return a ID that is the unique identifier for this trial. 
""" - advisor = get_advisor() - while not advisor.initialized: - _logger.info('Wait for RetiariiAdvisor to be initialized...') - time.sleep(0.5) - return advisor.send_trial(parameters, placement_constraint) + return get_advisor().send_trial(parameters, placement_constraint) def receive_trial_parameters() -> dict: """ From e679b4aaae83bc1a79c85257faec88533b101e43 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 13:26:05 +0800 Subject: [PATCH 75/77] quick fix --- test/ut/nas/test_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/ut/nas/test_engine.py b/test/ut/nas/test_engine.py index 2fb7949405..d91da0d00a 100644 --- a/test/ut/nas/test_engine.py +++ b/test/ut/nas/test_engine.py @@ -27,6 +27,7 @@ def test_base_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None advisor = RetiariiAdvisor('ws://_unittest_placeholder_') + advisor._advisor_initialized = True advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() @@ -44,6 +45,7 @@ def test_py_execution_engine(self): nni.retiarii.integration_api._advisor = None nni.retiarii.execution.api._execution_engine = None advisor = RetiariiAdvisor('ws://_unittest_placeholder_') + advisor._advisor_initialized = True advisor._channel = LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() From b115deb92e236dedd655815bda4998dae984403f Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 6 Sep 2022 14:49:01 +0800 Subject: [PATCH 76/77] fix incomplete test data --- test/ut/sdk/test_assessor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/ut/sdk/test_assessor.py b/test/ut/sdk/test_assessor.py index 48c2c03324..03af5149ab 100644 --- a/test/ut/sdk/test_assessor.py +++ b/test/ut/sdk/test_assessor.py @@ -48,11 +48,11 @@ class AssessorTestCase(TestCase): def test_assessor(self): pass _reverse_io() - send(CommandType.ReportMetricData, '{"trial_job_id":"A","type":"PERIODICAL","sequence":0,"value":"2"}') - send(CommandType.ReportMetricData, '{"trial_job_id":"B","type":"PERIODICAL","sequence":0,"value":"2"}') - send(CommandType.ReportMetricData, '{"trial_job_id":"A","type":"PERIODICAL","sequence":1,"value":"3"}') - send(CommandType.TrialEnd, '{"trial_job_id":"A","event":"SYS_CANCELED"}') - send(CommandType.TrialEnd, '{"trial_job_id":"B","event":"SUCCEEDED"}') + send(CommandType.ReportMetricData, '{"parameter_id": 0,"trial_job_id":"A","type":"PERIODICAL","sequence":0,"value":"2"}') + send(CommandType.ReportMetricData, '{"parameter_id": 1,"trial_job_id":"B","type":"PERIODICAL","sequence":0,"value":"2"}') + send(CommandType.ReportMetricData, '{"parameter_id": 0,"trial_job_id":"A","type":"PERIODICAL","sequence":1,"value":"3"}') + send(CommandType.TrialEnd, '{"trial_job_id":"A","event":"SYS_CANCELED","hyper_params":"{\\"parameter_id\\": 0}"}') + send(CommandType.TrialEnd, '{"trial_job_id":"B","event":"SUCCEEDED","hyper_params":"{\\"parameter_id\\": 1}"}') send(CommandType.NewTrialJob, 'null') _restore_io() From ef10426ded0ad03c9c9508be4a5ebe33a1be0716 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Tue, 4 Oct 2022 09:59:03 +0000 Subject: [PATCH 77/77] fix test of cgo engine --- test/algo/nas/test_cgo_engine.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/algo/nas/test_cgo_engine.py b/test/algo/nas/test_cgo_engine.py index 1da23cb33c..d5e089f0d1 100644 --- a/test/algo/nas/test_cgo_engine.py +++ b/test/algo/nas/test_cgo_engine.py @@ -319,6 +319,9 @@ def 
test_submit_models(self): advisor._channel = protocol.LegacyCommandChannel() advisor.default_worker.start() advisor.assessor_worker.start() + # this is because RetiariiAdvisor only works after `_advisor_initialized` becomes True. + # normally it becomes true when `handle_request_trial_jobs` is invoked + advisor._advisor_initialized = True remote = RemoteConfig(machine_list=[]) remote.machine_list.append(RemoteMachineConfig(host='test', gpu_indices=[0,1,2,3]))
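The last few test fixes all flip _advisor_initialized by hand because the series gates send_trial on that flag: it normally becomes True only when handle_request_trial_jobs is first invoked, so a strategy calling send_trial blocks until the advisor has heard from nnimanager. A self-contained sketch of that gate, with illustrative names only:

import time

class GatedAdvisor:
    # stand-in for the _advisor_initialized gate added to RetiariiAdvisor
    def __init__(self):
        self._advisor_initialized = False
        self.parameters_count = 0

    def handle_add_customized_trial(self, recovered_trials):
        # resume path: replayed trials set the parameter-id watermark so that
        # newly generated ids never collide with pre-resume ones
        self.parameters_count = max(
            (t['parameter_id'] for t in recovered_trials if isinstance(t.get('parameter_id'), int)),
            default=0)

    def handle_request_trial_jobs(self, num_trials):
        # normal path: the first request from nnimanager opens the gate
        self._advisor_initialized = True

    def send_trial(self, parameters):
        while not self._advisor_initialized:   # same busy-wait as in integration.py
            time.sleep(0.5)
        self.parameters_count += 1
        return self.parameters_count

advisor = GatedAdvisor()
advisor._advisor_initialized = True            # what the unit-test fixes above do
assert advisor.send_trial({'layer': 'conv'}) == 1

Tests that drive the advisor directly (test_engine.py and test_cgo_engine.py above) never receive a request-trial-jobs command, hence the explicit advisor._advisor_initialized = True before submitting models.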