This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

[nas] fix issue introduced by the trial recovery feature #5109

Merged · 92 commits · merged on Oct 12, 2022
Commits (92 total; changes shown from 86 commits)
e766a22
update
QuanluZhang May 5, 2022
0a39a09
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 5, 2022
e074967
update
QuanluZhang May 5, 2022
1f4eeea
update
QuanluZhang May 5, 2022
b9c788b
update
QuanluZhang May 6, 2022
1e97e04
update
QuanluZhang May 6, 2022
c4be6d5
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 7, 2022
9e39e92
runnable
QuanluZhang May 7, 2022
6ebd774
update
QuanluZhang May 9, 2022
81ff246
update
QuanluZhang May 9, 2022
5d3e681
fix pylint
QuanluZhang May 10, 2022
9c580d5
fix pyright
QuanluZhang May 10, 2022
1c2f6de
update
QuanluZhang May 10, 2022
5086e0a
fix pyright
QuanluZhang May 10, 2022
db9f4e4
update
QuanluZhang May 10, 2022
9097175
minor
QuanluZhang May 10, 2022
879aa56
minor
QuanluZhang May 10, 2022
1d723ad
update
QuanluZhang May 10, 2022
3d9e10c
resolve some comments
QuanluZhang May 15, 2022
a8c15ea
resolve comments
QuanluZhang May 15, 2022
05d71fe
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 15, 2022
5f4b32c
minor
QuanluZhang May 15, 2022
6743fa7
pyright
QuanluZhang May 16, 2022
aa85f16
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 16, 2022
d1ea7f5
fix ut
QuanluZhang May 16, 2022
a3b55c2
minor
QuanluZhang May 16, 2022
f895116
fix cgo pipe
QuanluZhang May 16, 2022
33fd0b0
refactor
QuanluZhang May 16, 2022
7609983
fix pylint
QuanluZhang May 16, 2022
c51a520
minor
QuanluZhang May 16, 2022
7edef1a
fix pyright
QuanluZhang May 16, 2022
644cc72
resolve comments
QuanluZhang May 20, 2022
d610d43
resolve all the comments
QuanluZhang May 23, 2022
42824fd
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang May 23, 2022
6e9ca35
add comment
QuanluZhang May 23, 2022
3e4a84a
fix bug
QuanluZhang May 23, 2022
b6876eb
remove print
QuanluZhang May 23, 2022
1055399
remove trailing whitespace
QuanluZhang May 23, 2022
874d19b
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 6, 2022
aaab676
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 6, 2022
e0be690
fix not exist issue
QuanluZhang Jun 7, 2022
811e44e
add unittest
QuanluZhang Jun 8, 2022
bc849a1
add one more test
QuanluZhang Jun 8, 2022
2fbc261
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 8, 2022
77ae20b
resolve comments
QuanluZhang Jun 8, 2022
b664a0a
update
QuanluZhang Jun 9, 2022
ecf87c3
fix pipeline
QuanluZhang Jun 9, 2022
49fa868
add timeout for one test
QuanluZhang Jun 10, 2022
8d44079
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jun 15, 2022
fc99b43
release note
QuanluZhang Jun 15, 2022
a94cdf9
resolve comments
QuanluZhang Jun 21, 2022
11e458d
Merge branch 'v2.8' of github.com:microsoft/nni into dev-new-nas-expe…
QuanluZhang Jun 21, 2022
57a03f7
add doc links
QuanluZhang Jun 21, 2022
09ba2c5
update
QuanluZhang Jun 22, 2022
ad4e90e
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Jul 3, 2022
72f3ff8
nas experiment view
QuanluZhang Jul 5, 2022
e0342eb
minor
QuanluZhang Jul 5, 2022
99374f3
support nas experiment resume
QuanluZhang Jul 7, 2022
cbaad0f
fix pylint
QuanluZhang Jul 7, 2022
0abe5a3
finish main functionality
QuanluZhang Jul 11, 2022
811ade7
fix pylint
QuanluZhang Jul 11, 2022
7b2d042
fix pyright
QuanluZhang Jul 11, 2022
6e2019c
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Aug 6, 2022
36103d8
update
QuanluZhang Aug 6, 2022
12334d6
resolve comments
QuanluZhang Aug 6, 2022
49b1dc4
minor
QuanluZhang Aug 6, 2022
adf87a8
add ut
QuanluZhang Aug 9, 2022
6ecfd3a
minor
QuanluZhang Aug 9, 2022
8f20b2f
fix pylint
QuanluZhang Aug 9, 2022
ece0771
fix ut
QuanluZhang Aug 9, 2022
bb71804
fix ut
QuanluZhang Aug 9, 2022
3185609
minor
QuanluZhang Aug 9, 2022
0b4cc96
resolve comments
QuanluZhang Aug 9, 2022
e520c0d
resolve comments
QuanluZhang Aug 9, 2022
a6cb74d
fix pylint
QuanluZhang Aug 9, 2022
e6f0fea
resolve comments
QuanluZhang Aug 10, 2022
f4faa73
move test
QuanluZhang Aug 10, 2022
7572627
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Aug 26, 2022
eab49a6
fix bug and add doc
QuanluZhang Aug 26, 2022
bccb126
resolve comment
QuanluZhang Aug 29, 2022
2b8d58e
Merge branch 'master' of github.com:microsoft/nni into dev-new-nas-ex…
QuanluZhang Sep 5, 2022
4d17783
fix issue
QuanluZhang Sep 5, 2022
097a781
fix
QuanluZhang Sep 5, 2022
ca4d86d
update
QuanluZhang Sep 5, 2022
eadfeb9
minor
QuanluZhang Sep 5, 2022
2439a57
fix pylint
QuanluZhang Sep 6, 2022
c17256d
fix bug
QuanluZhang Sep 6, 2022
7d905ec
resolve comments
QuanluZhang Sep 6, 2022
e679b4a
quick fix
QuanluZhang Sep 6, 2022
b115deb
fix incomplete test data
QuanluZhang Sep 6, 2022
abf7116
Merge branch 'master' of https://github.com/microsoft/nni into dev-ne…
QuanluZhang Oct 4, 2022
ef10426
fix test of cgo engine
QuanluZhang Oct 4, 2022
17 changes: 15 additions & 2 deletions nni/algorithms/hpo/bohb_advisor/bohb_advisor.py
@@ -648,8 +648,11 @@ def handle_trial_end(self, data):
event: the job's state
hyper_params: the hyperparameters (a string) generated and returned by tuner
"""
logger.debug('Tuner handle trial end, result is %s', data)
hyper_params = nni.load(data['hyper_params'])
if self.is_created_in_previous_exp(hyper_params['parameter_id']):
# The end of the recovered trial is ignored
return
logger.debug('Tuner handle trial end, result is %s', data)
self._handle_trial_end(hyper_params['parameter_id'])
if data['trial_job_id'] in self.job_id_para_id_map:
del self.job_id_para_id_map[data['trial_job_id']]
@@ -695,6 +698,13 @@ def handle_report_metric_data(self, data):
ValueError
Data type not supported
"""
if self.is_created_in_previous_exp(data['parameter_id']):
if data['type'] == MetricType.FINAL:
# only deal with final metric using import data
param = self.get_previous_param(data['parameter_id'])
trial_data = [{'parameter': param, 'value': nni.load(data['value'])}]
self.handle_import_data(trial_data)
return
logger.debug('handle report metric data = %s', data)
if 'value' in data:
data['value'] = nni.load(data['value'])
@@ -752,7 +762,10 @@ def handle_report_metric_data(self, data):
'Data type not supported: {}'.format(data['type']))

def handle_add_customized_trial(self, data):
pass
global _next_parameter_id
# data: parameters
previous_max_param_id = self.recover_parameter_id(data)
_next_parameter_id = previous_max_param_id + 1

def handle_import_data(self, data):
"""Import additional data for tuning
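Across bohb_advisor, hyperband_advisor and msg_dispatcher (below) the fix follows one recovery pattern: record the parameter ids that belonged to the previous experiment, ignore trial-end events for those ids, and replay only their FINAL metrics through the import-data path. The following is a minimal, self-contained sketch of that control flow — hypothetical class and method names, not NNI's actual API:

class RecoveryAwareAdvisor:
    """Sketch: ignore events coming from trials created by a previous run."""

    def __init__(self):
        self.recovered_max_param_id = -1   # highest parameter id seen in the old run
        self.recovered_params = {}         # param_id -> parameters dict
        self.imported = []                 # (parameters, final value) pairs replayed

    def recover(self, old_trials):
        # old_trials: list of {'parameter_id': int, 'parameters': dict}
        for trial in old_trials:
            pid = trial['parameter_id']
            self.recovered_params[pid] = trial['parameters']
            self.recovered_max_param_id = max(self.recovered_max_param_id, pid)
        return self.recovered_max_param_id

    def is_recovered(self, param_id):
        return param_id <= self.recovered_max_param_id

    def on_trial_end(self, param_id):
        if self.is_recovered(param_id):
            return                         # the end of a recovered trial is ignored
        print(f'trial {param_id} finished in this run')

    def on_metric(self, param_id, metric_type, value):
        if self.is_recovered(param_id):
            if metric_type == 'FINAL':
                # only the final metric of a recovered trial is replayed (import path)
                self.imported.append((self.recovered_params[param_id], value))
            return
        print(f'live metric for trial {param_id}: {value}')


if __name__ == '__main__':
    advisor = RecoveryAwareAdvisor()
    advisor.recover([{'parameter_id': 0, 'parameters': {'lr': 0.1}}])
    advisor.on_metric(0, 'FINAL', 0.93)    # replayed, not handled as a live metric
    advisor.on_trial_end(0)                # silently ignored
    advisor.on_metric(1, 'FINAL', 0.95)    # a trial from the current run, handled normally

The id bookkeeping itself is what the new Recoverable.recover_parameter_id (further down in this diff) provides; the individual advisors differ only in which tuner-side import routine receives the replayed results.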
11 changes: 10 additions & 1 deletion nni/algorithms/hpo/hyperband_advisor.py
@@ -522,6 +522,9 @@ def handle_trial_end(self, data):
hyper_params: the hyperparameters (a string) generated and returned by tuner
"""
hyper_params = nni.load(data['hyper_params'])
if self.is_created_in_previous_exp(hyper_params['parameter_id']):
# The end of the recovered trial is ignored
return
self._handle_trial_end(hyper_params['parameter_id'])
if data['trial_job_id'] in self.job_id_para_id_map:
del self.job_id_para_id_map[data['trial_job_id']]
@@ -538,6 +541,9 @@ def handle_report_metric_data(self, data):
ValueError
Data type not supported
"""
if self.is_created_in_previous_exp(data['parameter_id']):
# do not support recovering the algorithm state
return
if 'value' in data:
data['value'] = nni.load(data['value'])
# multiphase? need to check
@@ -576,7 +582,10 @@ def handle_report_metric_data(self, data):
raise ValueError('Data type not supported: {}'.format(data['type']))

def handle_add_customized_trial(self, data):
pass
global _next_parameter_id
# data: parameters
previous_max_param_id = self.recover_parameter_id(data)
_next_parameter_id = previous_max_param_id + 1

def handle_import_data(self, data):
pass
13 changes: 0 additions & 13 deletions nni/algorithms/hpo/tpe_tuner.py
@@ -218,19 +218,6 @@ def import_data(self, data): # for resuming experiment
self.dedup.add_history(param)
_logger.info(f'Replayed {len(data)} FINISHED trials')

def import_customized_data(self, data): # for dedup customized / resumed
if isinstance(data, str):
data = nni.load(data)

for trial in data:
# {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...}
if isinstance(trial, str):
trial = nni.load(trial)
param = format_parameters(trial['parameters'], self.space)
self._running_params[trial['parameter_id']] = param
self.dedup.add_history(param)
_logger.info(f'Replayed {len(data)} RUNING/WAITING trials')

def suggest(args, rng, space, history):
params = {}
for key, spec in space.items():
25 changes: 22 additions & 3 deletions nni/nas/execution/common/integration.py
@@ -60,11 +60,16 @@ def __init__(self, url: str):
self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None

self.parameters_count = 0

# Sometimes messages arrive first before the callbacks get registered.
# Or in case that we allow engine to be absent during the experiment.
# Here we need to store the messages and invoke them later.
self.call_queue: List[Tuple[str, list]] = []
# this is for waiting the to-be-recovered trials from nnimanager
self._advisor_initialized = False

@property
def initialized(self):
return self._advisor_initialized

def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]):
"""
@@ -212,10 +217,22 @@ def handle_update_search_space(self, data):
self.search_space = data

def handle_trial_end(self, data):
# TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring
id_ = nni.load(data['hyper_params'])['parameter_id']
if self.is_created_in_previous_exp(id_):
_logger.info('The end of the recovered trial %d is ignored', id_)
return
_logger.debug('Trial end: %s', data)
self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
self.invoke_callback('trial_end', id_, data['event'] == 'SUCCEEDED')

def handle_report_metric_data(self, data):
# TODO: we should properly handle the trials in self._customized_parameter_ids instead of ignoring
if self.is_created_in_previous_exp(data['parameter_id']):
_logger.info('The metrics of the recovered trial %d are ignored', data['parameter_id'])
return
# NOTE: this part is not aligned with hpo tuners.
# in hpo tuners, trial_job_id is used for intermediate results handling
# parameter_id is for final result handling.
_logger.debug('Metric reported: %s', data)
if data['type'] == MetricType.REQUEST_PARAMETER:
raise ValueError('Request parameter not supported')
@@ -239,4 +256,6 @@ def handle_import_data(self, data):
pass

def handle_add_customized_trial(self, data):
pass
previous_max_param_id = self.recover_parameter_id(data)
self.parameters_count = previous_max_param_id
self._advisor_initialized = True
Review comment (Contributor):
Is it possible that handle_add_customized_trial is never called and _advisor_initialized is never set to true?

Reply (QuanluZhang, author, Sep 6, 2022):
Good point! handle_add_customized_trial is called when an experiment is resumed (even if no trial needs to be recovered), but it is not called when an experiment is freshly created, so this is a bug I introduced. I have moved the flag to handle_request_trial_jobs, which means that if no trial has been requested yet, send_trial will block. The "request trial" command is always sent by nnimanager.
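As a rough illustration of the gating described in this reply (made-up names, not the actual RetiariiAdvisor), setting the flag in handle_request_trial_jobs effectively turns send_trial into a wait-until-requested call:

import threading
import time

class AdvisorStub:
    """Sketch of the described gating: trials are only sent after the manager
    has asked for them."""

    def __init__(self):
        self._ready = threading.Event()

    def handle_request_trial_jobs(self, num):
        # in the fix described above, the "initialized" flag is set here,
        # because this command is always sent by nnimanager
        self._ready.set()
        print(f'manager requested {num} trial job(s)')

    def send_trial(self, parameters):
        while not self._ready.is_set():
            print('Wait for advisor to be initialized...')
            time.sleep(0.5)
        print('sending trial', parameters)


if __name__ == '__main__':
    advisor = AdvisorStub()
    # simulate nnimanager asking for a trial one second from now
    threading.Timer(1.0, advisor.handle_request_trial_jobs, args=(1,)).start()
    advisor.send_trial({'layers': 4})   # blocks until the request arrives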

10 changes: 9 additions & 1 deletion nni/nas/execution/common/integration_api.py
@@ -6,12 +6,16 @@
'_advisor' # FIXME: hack to make it importable for tests
]

import logging
import time
import warnings
from typing import NewType, Any

import nni
from nni.common.version import version_check

_logger = logging.getLogger(__name__)

# NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor
# because it would induce cycled import
RetiariiAdvisor = NewType('RetiariiAdvisor', Any)
@@ -41,7 +45,11 @@ def send_trial(parameters: dict, placement_constraint=None) -> int:
Send a new trial. Executed on tuner end.
Return a ID that is the unique identifier for this trial.
"""
return get_advisor().send_trial(parameters, placement_constraint)
advisor = get_advisor()
while not advisor.initialized:
_logger.info('Wait for RetiariiAdvisor to be initialized...')
time.sleep(0.5)
return advisor.send_trial(parameters, placement_constraint)

Review comment (Contributor):
Suggest putting this into RetiariiAdvisor.send_trial.

Reply (QuanluZhang, author):
Updated.

def receive_trial_parameters() -> dict:
"""
30 changes: 30 additions & 0 deletions nni/recoverable.py
@@ -4,8 +4,12 @@
from __future__ import annotations

import os
import nni

class Recoverable:
def __init__(self):
self.recovered_max_param_id = -1
self.recovered_trial_params = {}

def load_checkpoint(self) -> None:
pass
@@ -18,3 +22,29 @@ def get_checkpoint_path(self) -> str | None:
if ckp_path is not None and os.path.isdir(ckp_path):
return ckp_path
return None

def recover_parameter_id(self, data) -> int:
# this is for handling the resuming of the interrupted data: parameters
if not isinstance(data, list):
data = [data]

previous_max_param_id = 0
for trial in data:
# {'parameter_id': 0, 'parameter_source': 'resumed', 'parameters': {'batch_size': 128, ...}
if isinstance(trial, str):
trial = nni.load(trial)
if not isinstance(trial['parameter_id'], int):
# for dealing with user customized trials
# skip for now
continue
self.recovered_trial_params[trial['parameter_id']] = trial['parameters']
if previous_max_param_id < trial['parameter_id']:
previous_max_param_id = trial['parameter_id']
self.recovered_max_param_id = previous_max_param_id
return previous_max_param_id

def is_created_in_previous_exp(self, param_id: int) -> bool:
return param_id <= self.recovered_max_param_id

def get_previous_param(self, param_id: int) -> dict:
return self.recovered_trial_params[param_id]
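A short standalone sketch of the invariant these helpers rely on: after a resume, fresh parameter ids start at previous_max + 1, so comparing against recovered_max_param_id cleanly separates old trials from new ones (itertools.count stands in for NNI's internal parameter-id counter here):

import itertools

recovered = [
    {'parameter_id': 0, 'parameters': {'lr': 0.1}},
    {'parameter_id': 4, 'parameters': {'lr': 0.01}},   # ids need not be contiguous
]
recovered_max_param_id = max(t['parameter_id'] for t in recovered)

# equivalent of "_next_parameter_id = previous_max_param_id + 1"
next_parameter_id = itertools.count(recovered_max_param_id + 1)

def is_created_in_previous_exp(param_id: int) -> bool:
    return param_id <= recovered_max_param_id

new_ids = [next(next_parameter_id) for _ in range(3)]   # -> [5, 6, 7]
assert all(not is_created_in_previous_exp(i) for i in new_ids)
assert is_created_in_previous_exp(0) and is_created_in_previous_exp(4)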
24 changes: 15 additions & 9 deletions nni/runtime/msg_dispatcher.py
@@ -120,15 +120,10 @@ def handle_import_data(self, data):
self.tuner.import_data(data)

def handle_add_customized_trial(self, data):
global _next_parameter_id
# data: parameters
if not isinstance(data, list):
data = [data]

for _ in data:
id_ = _create_parameter_id()
_customized_parameter_ids.add(id_)

self.tuner.import_customized_data(data)
previous_max_param_id = self.recover_parameter_id(data)
_next_parameter_id = previous_max_param_id + 1

def handle_report_metric_data(self, data):
"""
@@ -137,6 +132,13 @@ def handle_report_metric_data(self, data):
- 'value': metric value reported by nni.report_final_result()
- 'type': report type, support {'FINAL', 'PERIODICAL'}
"""
if self.is_created_in_previous_exp(data['parameter_id']):
if data['type'] == MetricType.FINAL:
# only deal with final metric using import data
param = self.get_previous_param(data['parameter_id'])
trial_data = [{'parameter': param, 'value': load(data['value'])}]
self.handle_import_data(trial_data)
return
# metrics value is dumped as json string in trial, so we need to decode it here
if 'value' in data:
data['value'] = load(data['value'])
@@ -166,14 +168,18 @@ def handle_trial_end(self, data):
- event: the job's state
- hyper_params: the hyperparameters generated and returned by tuner
"""
id_ = load(data['hyper_params'])['parameter_id']
if self.is_created_in_previous_exp(id_):
# The end of the recovered trial is ignored
return
trial_job_id = data['trial_job_id']
_ended_trials.add(trial_job_id)
if trial_job_id in _trial_history:
_trial_history.pop(trial_job_id)
if self.assessor is not None:
self.assessor.trial_end(trial_job_id, data['event'] == 'SUCCEEDED')
if self.tuner is not None:
self.tuner.trial_end(load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
self.tuner.trial_end(id_, data['event'] == 'SUCCEEDED')

def _handle_final_metric_data(self, data):
"""Call tuner to process final results
1 change: 1 addition & 0 deletions nni/runtime/msg_dispatcher_base.py
@@ -30,6 +30,7 @@ class MsgDispatcherBase(Recoverable):
"""

def __init__(self, command_channel_url=None):
super().__init__()
self.stopping = False
if command_channel_url is None:
command_channel_url = dispatcher_env_vars.NNI_TUNER_COMMAND_CHANNEL
8 changes: 0 additions & 8 deletions nni/tuner.py
@@ -219,14 +219,6 @@ def import_data(self, data: list[TrialRecord]) -> None:
# data: a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
pass

def import_customized_data(self, data: list[TrialRecord]) -> None:
"""
Internal API under revising, not recommended for end users.
"""
# Import resume data for avoiding duplications
# data: a list of dictionarys, each of which has at least two keys, 'parameter_id' and 'parameters'
pass

def _on_exit(self) -> None:
pass
