From 43c594cc4c3545942ae53245fa3ad14b574077fd Mon Sep 17 00:00:00 2001 From: J-shang Date: Mon, 19 Apr 2021 14:00:09 +0800 Subject: [PATCH 1/3] fix logDir rewrite --- nni/tools/nnictl/launcher.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index d170c9a012..98e0b617cb 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -319,7 +319,10 @@ def launch_experiment(args, experiment_config, mode, experiment_id, config_versi if package_name in ['SMAC', 'BOHB', 'PPOTuner']: print_error(f'The dependencies for {package_name} can be installed through pip install nni[{package_name}]') raise - log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else NNI_HOME_DIR + if config_version == 1: + log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else NNI_HOME_DIR + else: + log_dir = experiment_config['experimentWorkingDirectory'] if experiment_config.get('experimentWorkingDirectory') else NNI_HOME_DIR log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None #view experiment mode do not need debug function, when view an experiment, there will be no new logs created foreground = False @@ -471,8 +474,10 @@ def manage_stopped_experiment(args, mode): assert 'trainingService' in experiment_config or 'trainingServicePlatform' in experiment_config try: if 'trainingService' in experiment_config: + experiment_config['experimentWorkingDirectory'] = experiments_dict[args.id]['logDir'] launch_experiment(args, experiment_config, mode, experiment_id, 2) else: + experiment_config['logDir'] = experiments_dict[args.id]['logDir'] launch_experiment(args, experiment_config, mode, experiment_id, 1) except Exception as exception: restServerPid = Experiments().get_all_experiments().get(experiment_id, {}).get('pid') From b7d265bbdb1111496dfe8f93567dee93c5847979 Mon Sep 17 00:00:00 2001 From: J-shang Date: Mon, 19 Apr 2021 16:48:28 +0800 Subject: [PATCH 2/3] modify api --- nni/experiment/experiment.py | 63 +++++++++++++++++-------------- ts/nni_manager/core/nnimanager.ts | 1 + 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/nni/experiment/experiment.py b/nni/experiment/experiment.py index 9bf042ad78..cf6d7de5e4 100644 --- a/nni/experiment/experiment.py +++ b/nni/experiment/experiment.py @@ -40,7 +40,7 @@ def __init__(self, config: ExperimentConfig) -> None: """ Prepare an experiment. - Use `Experiment.start()` to launch it. + Use `Experiment.run()` to launch it. Parameters ---------- @@ -60,7 +60,7 @@ def __init__(self, training_service: Union[str, List[str]]) -> None: experiment.config.trial_command = 'python3 trial.py' experiment.config.machines.append(RemoteMachineConfig(ip=..., user_name=...)) ... - experiment.start(8080) + experiment.run(8080) Parameters ---------- @@ -149,27 +149,30 @@ def stop(self) -> None: self._proc = None _logger.info('Experiment stopped') - def run(self, port: int = 8080, debug: bool = False) -> bool: + def run(self, port: int = 8080, wait_completion: bool = True, debug: bool = False) -> bool: """ Run the experiment. - This function will block until experiment finish or error. + If wait_completion is True, this function will block until experiment finish or error. Return `True` when experiment done; or return `False` when experiment failed. + + Else if wait_completion is False, this function will non-block and return None immediately. """ self.start(port, debug) - try: - while True: - time.sleep(10) - status = self.get_status() - if status == 'DONE' or status == 'STOPPED': - return True - if status == 'ERROR': - return False - except KeyboardInterrupt: - _logger.warning('KeyboardInterrupt detected') - finally: - self.stop() + if wait_completion: + try: + while True: + time.sleep(10) + status = self.get_status() + if status == 'DONE' or status == 'STOPPED': + return True + if status == 'ERROR': + return False + except KeyboardInterrupt: + _logger.warning('KeyboardInterrupt detected') + finally: + self.stop() @classmethod def connect(cls, port: int): @@ -194,7 +197,7 @@ def connect(cls, port: int): return experiment @classmethod - def resume(cls, experiment_id: str, port: int, wait_completion: bool = True, debug: bool = False): + def resume(cls, experiment_id: str, port: int = 8080, wait_completion: bool = True, debug: bool = False): """ Resume a stopped experiment. @@ -212,14 +215,12 @@ def resume(cls, experiment_id: str, port: int, wait_completion: bool = True, deb experiment = Experiment() experiment.id = experiment_id experiment.mode = 'resume' - if wait_completion: - experiment.run(port, debug) - else: - experiment.start(port, debug) + experiment.run(port=port, wait_completion=wait_completion, debug=debug) + if not wait_completion: return experiment @classmethod - def view(cls, experiment_id: str, port: int, wait_completion: bool = True): + def view(cls, experiment_id: str, port: int = 8080, non_blocking: bool = False): """ View a stopped experiment. @@ -229,18 +230,24 @@ def view(cls, experiment_id: str, port: int, wait_completion: bool = True): The stopped experiment id. port The port of web UI. - wait_completion - If true, run in the foreground. If false, run in the background. + non_blocking + If false, run in the foreground. If true, run in the background. """ debug = False experiment = Experiment() experiment.id = experiment_id experiment.mode = 'view' - if wait_completion: - experiment.run(port, debug) - else: - experiment.start(port, debug) + experiment.start(port=port, debug=debug) + if non_blocking: return experiment + else: + try: + while True: + time.sleep(10) + except KeyboardInterrupt: + _logger.warning('KeyboardInterrupt detected') + finally: + experiment.stop() def get_status(self) -> str: """ diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index f1bfbd501a..34e7edf13b 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -204,6 +204,7 @@ class NNIManager implements Manager { this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId); this.readonly = readonly; if (readonly) { + this.setStatus('STOPPED'); return Promise.resolve(); } From dea3eb5c6e0523287cfc4855c13413d72d6d8413 Mon Sep 17 00:00:00 2001 From: J-shang Date: Mon, 19 Apr 2021 17:27:42 +0800 Subject: [PATCH 3/3] add nnimanager status VIEWED --- ts/nni_manager/common/manager.ts | 2 +- ts/nni_manager/core/nnimanager.ts | 2 +- ts/webui/src/App.tsx | 2 +- ts/webui/src/components/modals/ExperimentSummaryPanel.tsx | 2 +- ts/webui/src/components/trial-detail/TableList.tsx | 2 +- ts/webui/src/static/const.ts | 1 + ts/webui/src/static/style/overview/probar.scss | 6 ++++-- 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ts/nni_manager/common/manager.ts b/ts/nni_manager/common/manager.ts index a1a4c6a036..f94bf57d8b 100644 --- a/ts/nni_manager/common/manager.ts +++ b/ts/nni_manager/common/manager.ts @@ -8,7 +8,7 @@ import { TrialJobStatus, LogType } from './trainingService'; import { ExperimentConfig } from './experimentConfig'; type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM'; -type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL'; +type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL' | 'VIEWED'; namespace ExperimentStartUpMode { export const NEW = 'new'; export const RESUME = 'resume'; diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 34e7edf13b..ffc4da7548 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -204,7 +204,7 @@ class NNIManager implements Manager { this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId); this.readonly = readonly; if (readonly) { - this.setStatus('STOPPED'); + this.setStatus('VIEWED'); return Promise.resolve(); } diff --git a/ts/webui/src/App.tsx b/ts/webui/src/App.tsx index 00068888b0..fde6d54de9 100644 --- a/ts/webui/src/App.tsx +++ b/ts/webui/src/App.tsx @@ -237,7 +237,7 @@ class App extends React.Component<{}, AppState> { } // experiment status and /trial-jobs api's status could decide website update - if (['DONE', 'ERROR', 'STOPPED'].includes(EXPERIMENT.status) || TRIALS.jobListError()) { + if (['DONE', 'ERROR', 'STOPPED', 'VIEWED'].includes(EXPERIMENT.status) || TRIALS.jobListError()) { // experiment finished, refresh once more to ensure consistency this.setState(() => ({ interval: 0, isUpdate: false })); return; diff --git a/ts/webui/src/components/modals/ExperimentSummaryPanel.tsx b/ts/webui/src/components/modals/ExperimentSummaryPanel.tsx index 6ac983304c..2d6d51d939 100644 --- a/ts/webui/src/components/modals/ExperimentSummaryPanel.tsx +++ b/ts/webui/src/components/modals/ExperimentSummaryPanel.tsx @@ -54,7 +54,7 @@ class ExperimentSummaryPanel extends React.Component { private _renderOperationColumn(record: any): React.ReactNode { const runningTrial: boolean = ['RUNNING', 'UNKNOWN'].includes(record.status) ? false : true; - const disabledAddCustomizedTrial = ['DONE', 'ERROR', 'STOPPED'].includes(EXPERIMENT.status); + const disabledAddCustomizedTrial = ['DONE', 'ERROR', 'STOPPED', 'VIEWED'].includes(EXPERIMENT.status); return (