Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge master #120

Merged
merged 2 commits into from
Jan 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/nni_manager/common/log.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class BufferSerialEmitter {
@component.Singleton
class Logger {
private DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log');
private level: number = DEBUG;
private level: number = INFO;
private bufferSerialEmitter: BufferSerialEmitter;
private writable: Writable;

Expand Down Expand Up @@ -153,4 +153,4 @@ function getLogger(fileName?: string): Logger {
return component.get(Logger);
}

export { Logger, getLogger };
export { Logger, getLogger, logLevelNameMap };
5 changes: 3 additions & 2 deletions src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore';
import { TrialJobStatus } from './trainingService';

type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';

interface ExperimentParams {
authorName: string;
Expand Down Expand Up @@ -85,7 +86,7 @@ interface TrialJobStatistics {
}

interface NNIManagerStatus {
status: 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
status: ExperimentStatus;
errors: string[];
}

Expand All @@ -109,4 +110,4 @@ abstract class Manager {
public abstract getStatus(): NNIManagerStatus;
}

export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus };
export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus };
49 changes: 33 additions & 16 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import { NNIError } from '../common/errors';
import { getExperimentId, setInitTrialSequenceId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
ExperimentParams, ExperimentProfile, Manager,
ExperimentParams, ExperimentProfile, Manager, ExperimentStatus,
NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
} from '../common/manager';
import {
Expand Down Expand Up @@ -112,12 +112,13 @@ class NNIManager implements Manager {
}

public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
this.log.info(`User cancelTrialJob: ${trialJobId}`);
await this.trainingService.cancelTrialJob(trialJobId);
await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, '');
}

public async startExperiment(expParams: ExperimentParams): Promise<string> {
this.log.debug(`Starting experiment: ${this.experimentProfile.id}`);
this.log.info(`Starting experiment: ${this.experimentProfile.id}`);
this.experimentProfile.params = expParams;
await this.storeExperimentProfile();
this.log.debug('Setup tuner...');
Expand All @@ -138,7 +139,7 @@ class NNIManager implements Manager {
checkpointDir);

this.experimentProfile.startTime = Date.now();
this.status.status = 'RUNNING';
this.setStatus('RUNNING');
await this.storeExperimentProfile();
this.run().catch((err: Error) => {
this.criticalError(err);
Expand All @@ -148,6 +149,7 @@ class NNIManager implements Manager {
}

public async resumeExperiment(): Promise<void> {
this.log.info(`Resuming experiment: ${this.experimentProfile.id}`);
//Fetch back the experiment profile
const experimentId: string = getExperimentId();
this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
Expand Down Expand Up @@ -185,7 +187,7 @@ class NNIManager implements Manager {
this.experimentProfile.endTime) {
delete this.experimentProfile.endTime;
}
this.status.status = 'RUNNING';
this.setStatus('RUNNING');

// TO DO: update database record for resume event
this.run().catch((err: Error) => {
Expand All @@ -200,6 +202,7 @@ class NNIManager implements Manager {
}

public async setClusterMetadata(key: string, value: string): Promise<void> {
this.log.info(`NNIManager setClusterMetadata, key: ${key}, value: ${value}`);
let timeoutId: NodeJS.Timer;
// TO DO: move timeout value to constants file
const delay1: Promise<{}> = new Promise((resolve: Function, reject: Function): void => {
Expand All @@ -223,10 +226,10 @@ class NNIManager implements Manager {
}

public async stopExperiment(): Promise<void> {
this.status.status = 'STOPPING';
this.log.info('Experiment done, cleaning up...');
this.setStatus('STOPPING');
this.log.info('Stopping experiment, cleaning up ...');
await this.experimentDoneCleanUp();
this.log.info('Experiment done.');
this.log.info('Experiment stopped.');
}

public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
Expand Down Expand Up @@ -334,6 +337,7 @@ class NNIManager implements Manager {
if (trialJob.status === 'RUNNING' ||
trialJob.status === 'WAITING') {
try {
this.log.info(`cancelTrialJob: ${trialJob.id}`);
await this.trainingService.cancelTrialJob(trialJob.id);
} catch (error) {
// pid does not exist, do nothing here
Expand All @@ -343,7 +347,7 @@ class NNIManager implements Manager {
await this.trainingService.cleanUp();
this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
this.status.status = 'STOPPED';
this.setStatus('STOPPED');
}

private async periodicallyUpdateExecDuration(): Promise<void> {
Expand All @@ -365,8 +369,8 @@ class NNIManager implements Manager {
throw new Error('Error: tuner has not been setup');
}
while (!['ERROR', 'STOPPING', 'STOPPED'].includes(this.status.status)) {
await delay(1000 * 5);
this.dispatcher.sendCommand(PING);
await delay(1000 * 5);
}
}

Expand All @@ -379,6 +383,7 @@ class NNIManager implements Manager {
const trialJobDetail: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status !== trialJobDetail.status) {
this.log.info(`Trial job ${trialJobDetail.id} status changed from ${oldTrialJobDetail.status} to ${trialJobDetail.status}`);
this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail));
await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail);
}
Expand Down Expand Up @@ -478,12 +483,12 @@ class NNIManager implements Manager {
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') {
this.status.status = 'NO_MORE_TRIAL';
this.setStatus('NO_MORE_TRIAL');
waitSubmittedToFinish = this.currSubmittedTrialNum;

assert(allFinishedTrialJobNum <= waitSubmittedToFinish);
if (allFinishedTrialJobNum >= waitSubmittedToFinish) {
this.status.status = 'DONE';
this.setStatus('DONE');
this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
// write this log for travis CI
Expand All @@ -496,7 +501,7 @@ class NNIManager implements Manager {
await this.storeExperimentProfile();
}
if (this.status.status !== 'TUNER_NO_MORE_TRIAL') {
this.status.status = 'RUNNING';
this.setStatus('RUNNING');
}
for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
if (this.waitingTrials.length === 0 ||
Expand All @@ -515,6 +520,7 @@ class NNIManager implements Manager {
index: 0
}
};
this.log.info(`submitTrialJob: form: ${JSON.stringify(trialJobAppForm)}`);
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
await this.storeMaxSequenceId(trialJobDetail.sequenceId);
this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
Expand Down Expand Up @@ -558,6 +564,7 @@ class NNIManager implements Manager {
}

private addEventListeners(): void {
this.log.info('Add event listeners');
// TO DO: cannot run this method more than once in one NNIManager instance
if (this.dispatcher === undefined) {
throw new Error('Error: tuner or job maintainer have not been setup');
Expand Down Expand Up @@ -585,6 +592,7 @@ class NNIManager implements Manager {
}

private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
this.log.debug(`NNIManager received trial job metrics: ${metric}`);
await this.dataStore.storeMetricData(metric.id, metric.data);
if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup');
Expand Down Expand Up @@ -612,7 +620,7 @@ class NNIManager implements Manager {
}

private async onTunerCommand(commandType: string, content: string): Promise<void> {
this.log.info(`Command from tuner: ${commandType}, ${content}`);
this.log.info(`NNIManaer received command from dispatcher: ${commandType}, ${content}`);
switch (commandType) {
case INITIALIZED:
// Tuner is intialized, search space is set, request tuner to generate hyper parameters
Expand All @@ -621,7 +629,7 @@ class NNIManager implements Manager {
case NEW_TRIAL_JOB:
if (this.status.status === 'TUNER_NO_MORE_TRIAL') {
this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
this.status.status = 'RUNNING';
this.setStatus('RUNNING');
}
this.waitingTrials.push(content);
break;
Expand All @@ -637,14 +645,16 @@ class NNIManager implements Manager {
index: tunerCommand.parameter_index
}
};
this.log.info(`updateTrialJob: job id: ${tunerCommand.trial_job_id}, form: ${JSON.stringify(trialJobForm)}`);
await this.trainingService.updateTrialJob(tunerCommand.trial_job_id, trialJobForm);
await this.dataStore.storeTrialJobEvent(
'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined);
break;
case NO_MORE_TRIAL_JOBS:
this.status.status = 'TUNER_NO_MORE_TRIAL';
this.setStatus('TUNER_NO_MORE_TRIAL');
break;
case KILL_TRIAL_JOB:
this.log.info(`cancelTrialJob: ${JSON.parse(content)}`);
await this.trainingService.cancelTrialJob(JSON.parse(content), true);
break;
default:
Expand All @@ -662,7 +672,14 @@ class NNIManager implements Manager {
this.log.error(err.stack);
}
this.status.errors.push(err.message);
this.status.status = 'ERROR';
this.setStatus('ERROR');
}

private setStatus(status: ExperimentStatus): void {
if (status !== this.status.status) {
this.log.info(`Change NNIManager status from: ${this.status.status} to: ${status}`);
this.status.status = status;
}
}

private createEmptyExperimentProfile(): ExperimentProfile {
Expand Down
12 changes: 10 additions & 2 deletions src/nni_manager/core/sqlDatabase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import {
TrialJobEvent,
TrialJobEventRecord
} from '../common/datastore';
import { getLogger, Logger } from '../common/log';
import { ExperimentProfile } from '../common/manager';
import { TrialJobDetail } from '../common/trainingService';

Expand Down Expand Up @@ -95,13 +96,15 @@ function loadMetricData(row: any): MetricDataRecord {

class SqlDB implements Database {
private db!: sqlite3.Database;
private log: Logger = getLogger();
private initTask!: Deferred<void>;

public init(createNew: boolean, dbDir: string): Promise<void> {
if (this.initTask !== undefined) {
return this.initTask.promise;
}
this.initTask = new Deferred<void>();
this.log.debug(`Database directory: ${dbDir}`);
assert(fs.existsSync(dbDir));

// tslint:disable-next-line:no-bitwise
Expand Down Expand Up @@ -144,7 +147,7 @@ class SqlDB implements Database {
exp.maxSequenceId,
exp.revision
];

this.log.trace(`storeExperimentProfile: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });

Expand All @@ -161,7 +164,7 @@ class SqlDB implements Database {
sql = 'select * from ExperimentProfile where id=? and revision=?';
args = [experimentId, revision];
}

this.log.trace(`queryExperimentProfile: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<ExperimentProfile[]> = new Deferred<ExperimentProfile[]>();
this.db.all(sql, args, (err: Error | null, rows: any[]) => {
this.resolve(deferred, err, rows, loadExperimentProfile);
Expand All @@ -183,6 +186,7 @@ class SqlDB implements Database {
const sequenceId: number | undefined = jobDetail === undefined ? undefined : jobDetail.sequenceId;
const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId];

this.log.trace(`storeTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });

Expand All @@ -205,6 +209,7 @@ class SqlDB implements Database {
args = [trialJobId, event];
}

this.log.trace(`queryTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<TrialJobEventRecord[]> = new Deferred<TrialJobEventRecord[]>();
this.db.all(sql, args, (err: Error | null, rows: any[]) => {
this.resolve(deferred, err, rows, loadTrialJobEvent);
Expand All @@ -218,6 +223,7 @@ class SqlDB implements Database {
const json: MetricDataRecord = JSON.parse(data);
const args: any[] = [Date.now(), json.trialJobId, json.parameterId, json.type, json.sequence, JSON.stringify(json.data)];

this.log.trace(`storeMetricData: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<void> = new Deferred<void>();
this.db.run(sql, args, (err: Error | null) => { this.resolve(deferred, err); });

Expand All @@ -240,6 +246,7 @@ class SqlDB implements Database {
args = [trialJobId, metricType];
}

this.log.trace(`queryMetricData: SQL: ${sql}, args: ${JSON.stringify(args)}`);
const deferred: Deferred<MetricDataRecord[]> = new Deferred<MetricDataRecord[]>();
this.db.all(sql, args, (err: Error | null, rows: any[]) => {
this.resolve(deferred, err, rows, loadMetricData);
Expand Down Expand Up @@ -268,6 +275,7 @@ class SqlDB implements Database {
for (const row of (<any[]>rows)) {
data.push(rowLoader(row));
}
this.log.trace(`sql query result: ${JSON.stringify(data)}`);
(<Deferred<T[]>>deferred).resolve(data);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import * as component from './common/component';
import * as fs from 'fs';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log';
import { getLogger, Logger, logLevelNameMap } from './common/log';
import { Manager } from './common/manager';
import { TrainingService } from './common/trainingService';
import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils';
Expand Down Expand Up @@ -111,7 +111,7 @@ if (logDir.length > 0) {
}

const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !['debug', 'info', 'error', 'warning', 'critical'].includes(logLevel)) {
if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
console.log(`FATAL: invalid log_level: ${logLevel}`);
}

Expand Down
Loading