Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Detect tuner / assessor failure #635

Merged
merged 20 commits into from
Jan 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/nni_manager/core/commands.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ const UPDATE_SEARCH_SPACE = 'SS';
const ADD_CUSTOMIZED_TRIAL_JOB = 'AD';
const TRIAL_END = 'EN';
const TERMINATE = 'TE';
const PING = 'PI';

const INITIALIZED = 'ID';
const NEW_TRIAL_JOB = 'TR';
Expand All @@ -39,6 +40,7 @@ const TUNER_COMMANDS: Set<string> = new Set([
UPDATE_SEARCH_SPACE,
ADD_CUSTOMIZED_TRIAL_JOB,
TERMINATE,
PING,

INITIALIZED,
NEW_TRIAL_JOB,
Expand All @@ -63,6 +65,7 @@ export {
ADD_CUSTOMIZED_TRIAL_JOB,
TRIAL_END,
TERMINATE,
PING,
INITIALIZED,
NEW_TRIAL_JOB,
NO_MORE_TRIAL_JOBS,
Expand Down
19 changes: 16 additions & 3 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ import {
import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
import { delay, getLogDir, getCheckpointDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import { delay, getCheckpointDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS,
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
} from './commands';
import { createDispatcherInterface, IpcInterface } from './ipcInterface';

/**
* NNIManager
* NNIManager which implements Manager interface
*/
class NNIManager implements Manager {
private trainingService: TrainingService;
Expand Down Expand Up @@ -360,6 +360,16 @@ class NNIManager implements Manager {
}
}

/**
 * Heartbeat loop: sends a PING command to the dispatcher every 5 seconds
 * until the experiment status becomes ERROR, STOPPING or STOPPED.
 *
 * The status is checked again after each sleep, so no PING is sent once the
 * experiment has entered one of those states while this loop was waiting.
 *
 * @throws Error if the dispatcher is undefined when the loop is started.
 */
private async pingDispatcher(): Promise<void> {
    if (this.dispatcher === undefined) {
        throw new Error('Error: tuner has not been setup');
    }
    const pingIntervalMs: number = 1000 * 5;
    const terminalStatuses: string[] = ['ERROR', 'STOPPING', 'STOPPED'];
    while (!terminalStatuses.includes(this.status.status)) {
        await delay(pingIntervalMs);
        // The status may have changed during the sleep; do not ping in that case.
        if (terminalStatuses.includes(this.status.status)) {
            break;
        }
        this.dispatcher.sendCommand(PING);
    }
}

private async requestTrialJobsStatus(): Promise<number> {
let finishedTrialJobNum: number = 0;
if (this.dispatcher === undefined) {
Expand Down Expand Up @@ -536,6 +546,9 @@ class NNIManager implements Manager {

await Promise.all([
this.periodicallyUpdateExecDuration(),
this.pingDispatcher().catch((err: Error) => {
throw new NNIError('Dispatcher error', `Dispatcher error: ${err.message}`, err);
}),
this.trainingService.run().catch((err: Error) => {
throw new NNIError('Training service error', `Training service error: ${err.message}`, err);
}),
Expand Down
7 changes: 6 additions & 1 deletion src/sdk/pynni/nni/msg_dispatcher_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def handle_request(self, request):

_logger.debug('handle request: command: [{}], data: [{}]'.format(command, data))

data = json_tricks.loads(data)
if data:
data = json_tricks.loads(data)

command_handlers = {
# Tuner commands:
Expand All @@ -96,12 +97,16 @@ def handle_request(self, request):
CommandType.ReportMetricData: self.handle_report_metric_data,

CommandType.TrialEnd: self.handle_trial_end,
CommandType.Ping: self.handle_ping,
}
if command not in command_handlers:
raise AssertionError('Unsupported command: {}'.format(command))

return command_handlers[command](data)

def handle_ping(self, data):
    """Handle a Ping command: intentionally a no-op, `data` is ignored."""
    return None

def handle_initialize(self, data):
    """Placeholder handler for the initialize command.

    Always raises NotImplementedError; presumably concrete dispatchers
    override this method (confirm against subclasses).
    """
    message = 'handle_initialize not implemented'
    raise NotImplementedError(message)

Expand Down
1 change: 1 addition & 0 deletions src/sdk/pynni/nni/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class CommandType(Enum):
AddCustomizedTrialJob = b'AD'
TrialEnd = b'EN'
Terminate = b'TE'
Ping = b'PI'

# out
Initialized = b'ID'
Expand Down