Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Fix sequence id issue on resuming experiment (#316)
Browse files (browse the repository at this point in the history)
  • Loading branch information
chicm-ms authored Nov 2, 2018
1 parent 06710ab commit f56f688
Show file tree
Hide file tree
Showing 10 changed files with 63 additions and 13 deletions.
23 changes: 22 additions & 1 deletion src/nni_manager/common/experimentStartupInfo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class ExperimentStartupInfo {
private experimentId: string = '';
private newExperiment: boolean = true;
private initialized: boolean = false;
private initTrialSequenceID: number = 0;

public setStartupInfo(newExperiment: boolean, experimentId: string): void {
assert(!this.initialized);
Expand All @@ -48,6 +49,17 @@ class ExperimentStartupInfo {

return this.newExperiment;
}

/**
 * Record the sequence id that should be handed out first.
 *
 * @param initSequenceId value stored in `initTrialSequenceID`
 */
public setInitTrialSequenceId(initSequenceId: number): void {
    // Refuse to accept a value while the `initialized` flag is still false.
    assert(this.initialized);

    this.initTrialSequenceID = initSequenceId;
}

/**
 * Read back the initial trial sequence id held by this object.
 *
 * @returns the current value of `initTrialSequenceID`
 */
public getInitTrialSequenceId(): number {
    // Reading is only allowed once the `initialized` flag is set.
    assert(this.initialized);
    const firstSequenceId: number = this.initTrialSequenceID;

    return firstSequenceId;
}
}

function getExperimentId(): string {
Expand All @@ -58,8 +70,17 @@ function isNewExperiment(): boolean {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isNewExperiment();
}

/**
 * Module-level convenience wrapper: resolves the ExperimentStartupInfo
 * instance through `component.get` and forwards the value to its
 * `setInitTrialSequenceId` method.
 *
 * @param initSequenceId value passed through unchanged
 */
function setInitTrialSequenceId(initSequenceId: number): void {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
    startupInfo.setInitTrialSequenceId(initSequenceId);
}

/**
 * Module-level convenience wrapper: resolves the ExperimentStartupInfo
 * instance through `component.get` and returns whatever its
 * `getInitTrialSequenceId` method reports.
 *
 * @returns the initial trial sequence id held by the resolved instance
 */
function getInitTrialSequenceId(): number {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);

    return startupInfo.getInitTrialSequenceId();
}

/**
 * Module-level convenience wrapper: resolves the ExperimentStartupInfo
 * instance through `component.get` and forwards both arguments to its
 * `setStartupInfo` method.
 *
 * @param newExperiment passed through unchanged as the first argument
 * @param experimentId passed through unchanged as the second argument
 */
function setExperimentStartupInfo(newExperiment: boolean, experimentId: string): void {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
    startupInfo.setStartupInfo(newExperiment, experimentId);
}

export { ExperimentStartupInfo, getExperimentId, isNewExperiment, setExperimentStartupInfo };
export { ExperimentStartupInfo, getExperimentId, isNewExperiment,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
1 change: 1 addition & 0 deletions src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ interface ExperimentProfile {
logDir?: string;
startTime?: number;
endTime?: number;
maxSequenceId: number;
revision: number;
}

Expand Down
13 changes: 12 additions & 1 deletion src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
import { NNIError } from '../common/errors';
import { getExperimentId } from '../common/experimentStartupInfo';
import { getExperimentId, setInitTrialSequenceId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
ExperimentParams, ExperimentProfile, Manager,
Expand Down Expand Up @@ -152,6 +152,8 @@ class NNIManager implements Manager {
this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
const expParams: ExperimentParams = this.experimentProfile.params;

setInitTrialSequenceId(this.experimentProfile.maxSequenceId + 1);

// Set up multiphase config
if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
Expand Down Expand Up @@ -462,6 +464,7 @@ class NNIManager implements Manager {
}
};
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
await this.storeMaxSequenceId(trialJobDetail.sequenceId);
this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id);
if (trialJobDetailSnapshot != undefined) {
Expand Down Expand Up @@ -593,6 +596,7 @@ class NNIManager implements Manager {
revision: 0,
execDuration: 0,
logDir: getLogDir(),
maxSequenceId: 0,
params: {
authorName: '',
experimentName: '',
Expand All @@ -609,6 +613,13 @@ class NNIManager implements Manager {
}
};
}

/**
 * Raise the profile's `maxSequenceId` to `sequenceId` when the latter is
 * larger, then await `storeExperimentProfile()`. Does nothing otherwise.
 *
 * @param sequenceId candidate value compared against the stored maximum
 */
private async storeMaxSequenceId(sequenceId: number): Promise<void> {
    // Written as a negated `>` so the comparison is exactly the original one.
    if (!(sequenceId > this.experimentProfile.maxSequenceId)) {
        return;
    }

    this.experimentProfile.maxSequenceId = sequenceId;
    await this.storeExperimentProfile();
}
}

export { NNIManager };
5 changes: 4 additions & 1 deletion src/nni_manager/core/sqlDatabase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ create table ExperimentProfile (
startTime integer,
endTime integer,
logDir text,
maxSequenceId integer,
revision integer);
create index ExperimentProfile_id on ExperimentProfile(id);
`;
Expand All @@ -65,6 +66,7 @@ function loadExperimentProfile(row: any): ExperimentProfile {
startTime: row.startTime === null ? undefined : row.startTime,
endTime: row.endTime === null ? undefined : row.endTime,
logDir: row.logDir === null ? undefined : row.logDir,
maxSequenceId: row.maxSequenceId,
revision: row.revision
};
}
Expand Down Expand Up @@ -131,14 +133,15 @@ class SqlDB implements Database {
}

public storeExperimentProfile(exp: ExperimentProfile): Promise<void> {
const sql: string = 'insert into ExperimentProfile values (?,?,?,?,?,?,?)';
const sql: string = 'insert into ExperimentProfile values (?,?,?,?,?,?,?,?)';
const args: any[] = [
JSON.stringify(exp.params),
exp.id,
exp.execDuration,
exp.startTime === undefined ? null : exp.startTime,
exp.endTime === undefined ? null : exp.endTime,
exp.logDir === undefined ? null : exp.logDir,
exp.maxSequenceId,
exp.revision
];

Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/core/test/dataStore.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ describe('Unit test for dataStore', () => {
execDuration: 0,
startTime: Date.now(),
endTime: Date.now(),
maxSequenceId: 0,
revision: 0
}
const id: string = profile.id;
Expand Down
8 changes: 4 additions & 4 deletions src/nni_manager/core/test/sqlDatabase.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ const expParams2: ExperimentParams = {
};

const profiles: ExperimentProfile[] = [
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: undefined, revision: 1 },
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 3 }
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: undefined, revision: 1, maxSequenceId: 0 },
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2, maxSequenceId: 0 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2, maxSequenceId: 0 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 3, maxSequenceId: 0 }
];

const events: TrialJobEventRecord[] = [
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/rest_server/test/mockedNNIManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ export class MockedNNIManager extends Manager {
execDuration: 0,
startTime: Date.now(),
endTime: Date.now(),
maxSequenceId: 0,
revision: 0
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common
import { getLogger, Logger } from '../../common/log';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import {
HostJobApplicationForm, JobApplicationForm, HyperParameters, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../common/utils';
import { file } from 'tmp';

const tkill = require('tree-kill');

Expand Down Expand Up @@ -111,7 +111,7 @@ class LocalTrainingService implements TrainingService {
this.initialized = false;
this.stopping = false;
this.log = getLogger();
this.trialSequenceId = 0;
this.trialSequenceId = -1;
}

public async run(): Promise<void> {
Expand Down Expand Up @@ -432,6 +432,10 @@ class LocalTrainingService implements TrainingService {
}

/**
 * Hand out the next trial sequence id.
 *
 * A counter value of -1 is treated as "not seeded yet": in that case the
 * counter is first loaded from `getInitTrialSequenceId()`.
 *
 * @returns the counter value before it is advanced by one
 */
private generateSequenceId(): number {
    const needsSeeding: boolean = this.trialSequenceId === -1;
    if (needsSeeding) {
        this.trialSequenceId = getInitTrialSequenceId();
    }

    // Explicit form of a post-increment: return the old value, store old + 1.
    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId = allocatedId + 1;

    return allocatedId;
}

Expand Down
8 changes: 6 additions & 2 deletions src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import * as request from 'request';

import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { HDFSClientUtility } from './hdfsClientUtility'
import { MethodNotImplementedError } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
Expand Down Expand Up @@ -78,7 +78,7 @@ class PAITrainingService implements TrainingService {
this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.trialSequenceId = 0;
this.trialSequenceId = -1;
}

public async run(): Promise<void> {
Expand Down Expand Up @@ -454,6 +454,10 @@ class PAITrainingService implements TrainingService {
}

/**
 * Hand out the next trial sequence id.
 *
 * A counter value of -1 is treated as "not seeded yet": in that case the
 * counter is first loaded from `getInitTrialSequenceId()`.
 *
 * @returns the counter value before it is advanced by one
 */
private generateSequenceId(): number {
    const needsSeeding: boolean = this.trialSequenceId === -1;
    if (needsSeeding) {
        this.trialSequenceId = getInitTrialSequenceId();
    }

    // Explicit form of a post-increment: return the old value, store old + 1.
    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId = allocatedId + 1;

    return allocatedId;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
Expand Down Expand Up @@ -77,7 +77,7 @@ class RemoteMachineTrainingService implements TrainingService {
this.remoteExpRootDir = this.getRemoteExperimentRootDir();
this.timer = timer;
this.log = getLogger();
this.trialSequenceId = 0;
this.trialSequenceId = -1;
}

/**
Expand Down Expand Up @@ -607,6 +607,10 @@ class RemoteMachineTrainingService implements TrainingService {
}

/**
 * Hand out the next trial sequence id.
 *
 * A counter value of -1 is treated as "not seeded yet": in that case the
 * counter is first loaded from `getInitTrialSequenceId()`.
 *
 * @returns the counter value before it is advanced by one
 */
private generateSequenceId(): number {
    const needsSeeding: boolean = this.trialSequenceId === -1;
    if (needsSeeding) {
        this.trialSequenceId = getInitTrialSequenceId();
    }

    // Explicit form of a post-increment: return the old value, store old + 1.
    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId = allocatedId + 1;

    return allocatedId;
}

Expand Down

0 comments on commit f56f688

Please sign in to comment.