From 482ef514c99153e7ae05ea8ca982ae63fb580728 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Thu, 23 Jul 2020 14:20:22 +0800 Subject: [PATCH 01/30] support display log on local mode --- src/nni_manager/common/manager.ts | 4 +++- src/nni_manager/common/trainingService.ts | 5 ++++- src/nni_manager/core/nnimanager.ts | 6 +++++- src/nni_manager/core/test/mockedTrainingService.ts | 6 +++++- src/nni_manager/rest_server/restHandler.ts | 11 +++++++++++ .../rest_server/test/mockedNNIManager.ts | 5 ++++- .../training_service/dlts/dltsTrainingService.ts | 7 ++++++- .../kubernetes/kubernetesTrainingService.ts | 7 ++++++- .../training_service/local/localTrainingService.ts | 14 +++++++++++++- .../training_service/pai/paiTrainingService.ts | 7 ++++++- .../pai/paiYarn/paiYarnTrainingService.ts | 3 ++- .../remote_machine/remoteMachineTrainingService.ts | 13 +++++++++++-- 12 files changed, 76 insertions(+), 12 deletions(-) diff --git a/src/nni_manager/common/manager.ts b/src/nni_manager/common/manager.ts index f37745de16..c003598abc 100644 --- a/src/nni_manager/common/manager.ts +++ b/src/nni_manager/common/manager.ts @@ -4,7 +4,7 @@ 'use strict'; import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore'; -import { TrialJobStatus } from './trainingService'; +import { TrialJobStatus, LogType } from './trainingService'; type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM'; type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL'; @@ -101,6 +101,8 @@ abstract class Manager { public abstract getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise; public abstract getLatestMetricData(): Promise; + public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; + public abstract getTrialJobStatistics(): Promise; public abstract getStatus(): NNIManagerStatus; } diff --git a/src/nni_manager/common/trainingService.ts b/src/nni_manager/common/trainingService.ts index 83bd51e884..d5e22a4648 100644 --- a/src/nni_manager/common/trainingService.ts +++ b/src/nni_manager/common/trainingService.ts @@ -8,6 +8,8 @@ */ type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED'; +type LogType = 'TRIAL_LOG' | 'TRIAL_STDERR'; + interface TrainingServiceMetadata { readonly key: string; readonly value: string; @@ -79,6 +81,7 @@ abstract class TrainingService { public abstract updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise; public abstract get isMultiPhaseJobSupported(): boolean; public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise; + public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; public abstract setClusterMetadata(key: string, value: string): Promise; public abstract getClusterMetadata(key: string): Promise; public abstract cleanUp(): Promise; @@ -98,5 +101,5 @@ class NNIManagerIpConfig { export { TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm, TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters, - NNIManagerIpConfig + NNIManagerIpConfig, LogType }; diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 038fe9ef9a..ad243f4835 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -16,7 +16,7 @@ import { NNIManagerStatus, ProfileUpdateType, TrialJobStatistics } from '../common/manager'; import { - TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus + TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType } from '../common/trainingService'; import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils'; import { @@ -325,6 +325,10 @@ class NNIManager implements Manager { // FIXME: unit test } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + return this.trainingService.getTrialLog(trialJobId, logType); + } + public getExperimentProfile(): Promise { // TO DO: using Promise.resolve() const deferred: Deferred = new Deferred(); diff --git a/src/nni_manager/core/test/mockedTrainingService.ts b/src/nni_manager/core/test/mockedTrainingService.ts index 546a36e494..5dfec86427 100644 --- a/src/nni_manager/core/test/mockedTrainingService.ts +++ b/src/nni_manager/core/test/mockedTrainingService.ts @@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred'; import { Provider } from 'typescript-ioc'; import { MethodNotImplementedError } from '../../common/errors'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; const testTrainingServiceProvider: Provider = { get: () => { return new MockedTrainingService(); } @@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService { return deferred.promise; } + public getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + async run(): Promise { } diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index 457f154b69..b91fbeeb43 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -57,6 +57,7 @@ class NNIRestHandler { this.getMetricData(router); this.getMetricDataByRange(router); this.getLatestMetricData(router); + this.getTrialLog(router); this.exportData(router); // Express-joi-validator configuration @@ -268,6 +269,16 @@ class NNIRestHandler { }); } + private getTrialLog(router: Router): void { + router.get('/trial-log/:id/:type', async(req: Request, res: Response) => { + this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => { + res.send(log); + }).catch((err: Error) => { + this.handleError(err, res); + }); + }); + } + private exportData(router: Router): void { router.get('/export-data', (req: Request, res: Response) => { this.nniManager.exportData().then((exportedData: string) => { diff --git a/src/nni_manager/rest_server/test/mockedNNIManager.ts b/src/nni_manager/rest_server/test/mockedNNIManager.ts index 5c8bc267b7..e45819d6cb 100644 --- a/src/nni_manager/rest_server/test/mockedNNIManager.ts +++ b/src/nni_manager/rest_server/test/mockedNNIManager.ts @@ -13,7 +13,7 @@ import { TrialJobStatistics, NNIManagerStatus } from '../../common/manager'; import { - TrialJobApplicationForm, TrialJobDetail, TrialJobStatus + TrialJobApplicationForm, TrialJobDetail, TrialJobStatus, LogType } from '../../common/trainingService'; export const testManagerProvider: Provider = { @@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager { public getLatestMetricData(): Promise { throw new MethodNotImplementedError(); } + public getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } public getExperimentProfile(): Promise { const profile: ExperimentProfile = { params: { diff --git a/src/nni_manager/training_service/dlts/dltsTrainingService.ts b/src/nni_manager/training_service/dlts/dltsTrainingService.ts index ba707fbb13..ba3b02c9e6 100644 --- a/src/nni_manager/training_service/dlts/dltsTrainingService.ts +++ b/src/nni_manager/training_service/dlts/dltsTrainingService.ts @@ -12,9 +12,10 @@ import { EventEmitter } from 'events'; import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { DLTS_TRIAL_COMMAND_FORMAT } from './dltsData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; @@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService { return trialJob } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index f21ac9ad69..472696563d 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -12,8 +12,9 @@ import { Base64 } from 'js-base64'; import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { - NNIManagerIpConfig, TrialJobDetail, TrialJobMetric + NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { AzureStorageClientUtility } from './azureStorageClientUtils'; @@ -98,6 +99,10 @@ abstract class KubernetesTrainingService { return Promise.resolve(kubernetesTrialJob); } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 71a1c5719c..98ec8c10a7 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { HyperParameters, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric, TrialJobStatus + TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString @@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService { return trialJob; } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + let logPath: string; + if (logType === 'TRIAL_LOG') { + logPath = path.join(this.rootDir, 'trials', trialJobId, 'trial.log'); + } else if (logType === 'TRIAL_STDERR') { + logPath = path.join(this.rootDir, 'trials', trialJobId, 'stderr'); + } else { + throw new Error('unexpected log type'); + } + return fs.promises.readFile(logPath, 'utf8'); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.eventEmitter.on('metric', listener); } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index e26c16ecee..c3c59a2cbd 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -11,9 +11,10 @@ import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay } from '../../common/utils'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; @@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService { return jobs; } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public async getTrialJob(trialJobId: string): Promise { if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index 13bcdfd20f..b0cbe955d7 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -10,9 +10,10 @@ import * as component from '../../../common/component'; import { Deferred } from 'ts-deferred'; import { String } from 'typescript-string-operations'; +import { MethodNotImplementedError } from '../../../common/errors'; import { HyperParameters, NNIManagerIpConfig, - TrialJobApplicationForm, TrialJobDetail + TrialJobApplicationForm, TrialJobDetail, LogType } from '../../../common/trainingService'; import { generateParamFileName, diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 0fa8e1305b..604da2e0dd 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -9,13 +9,13 @@ import * as fs from 'fs'; import * as path from 'path'; import { Deferred } from 'ts-deferred'; import * as component from '../../common/component'; -import { NNIError, NNIErrorNames } from '../../common/errors'; +import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { ObservableTimer } from '../../common/observableTimer'; import { HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric + TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, @@ -173,6 +173,15 @@ class RemoteMachineTrainingService implements TrainingService { } } + /** + * Get trial job log + * @param trialJobId ID of trial job + * @param logType 'TRIAL_LOG' | 'TRIAL_STDERR' + */ + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + /** * Add job metrics listener * @param listener callback listener From dff2c2b366738d0f2599b60fbc936af4f0f3889c Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Thu, 23 Jul 2020 14:40:22 +0800 Subject: [PATCH 02/30] remove useless import --- .../training_service/pai/paiYarn/paiYarnTrainingService.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index b0cbe955d7..13bcdfd20f 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -10,10 +10,9 @@ import * as component from '../../../common/component'; import { Deferred } from 'ts-deferred'; import { String } from 'typescript-string-operations'; -import { MethodNotImplementedError } from '../../../common/errors'; import { HyperParameters, NNIManagerIpConfig, - TrialJobApplicationForm, TrialJobDetail, LogType + TrialJobApplicationForm, TrialJobDetail } from '../../../common/trainingService'; import { generateParamFileName, From 7315daca768f175cc77b0b860b96163a6bac6d1d Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Thu, 23 Jul 2020 15:17:14 +0800 Subject: [PATCH 03/30] implement abstract method for reuse mode --- .../training_service/reusable/routerTrainingService.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 1fd28604be..363565214f 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -6,7 +6,8 @@ import { Container, Scope } from 'typescript-ioc'; import * as component from '../../common/component'; import { getLogger, Logger } from '../../common/log'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; +import { MethodNotImplementedError } from '../../common/errors' +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; @@ -47,6 +48,10 @@ class RouterTrainingService implements TrainingService { return await this.internalTrainingService.getTrialJob(trialJobId); } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { if (this.internalTrainingService === undefined) { throw new Error("TrainingService is not assigned!"); From 520d9ad34a4f92f4bb37f2310a0a6dd94ff8aa69 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Thu, 23 Jul 2020 16:16:56 +0800 Subject: [PATCH 04/30] implement abstract method --- .../training_service/reusable/trialDispatcher.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 156909e129..f8797fce79 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -11,7 +11,8 @@ import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; -import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; +import { MethodNotImplementedError } from '../../common/errors' +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getLogLevel, getVersion, mkDirPSync, uniqueString, getIPV4Address } from '../../common/utils'; import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands'; import { GPUSummary } from '../../training_service/common/gpuData'; @@ -94,6 +95,10 @@ class TrialDispatcher implements TrainingService { return trial; } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public async submitTrialJob(form: TrialJobApplicationForm): Promise { if (this.trialConfig === undefined) { throw new Error(`trialConfig not initialized!`); From a73db7157ff58967bfa1a31af56a06e325b78b87 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Fri, 24 Jul 2020 13:55:09 +0800 Subject: [PATCH 05/30] fix eslint error --- .../training_service/dlts/dltsTrainingService.ts | 2 +- .../kubernetes/kubernetesTrainingService.ts | 2 +- src/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- .../remote_machine/remoteMachineTrainingService.ts | 6 +++--- .../training_service/reusable/routerTrainingService.ts | 2 +- .../training_service/reusable/trialDispatcher.ts | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nni_manager/training_service/dlts/dltsTrainingService.ts b/src/nni_manager/training_service/dlts/dltsTrainingService.ts index ba3b02c9e6..30d8fbcf8d 100644 --- a/src/nni_manager/training_service/dlts/dltsTrainingService.ts +++ b/src/nni_manager/training_service/dlts/dltsTrainingService.ts @@ -247,7 +247,7 @@ class DLTSTrainingService implements TrainingService { return trialJob } - public async getTrialLog(trialJobId: string, logType: LogType): Promise { + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { throw new MethodNotImplementedError(); } diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 472696563d..11a54c453c 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -99,7 +99,7 @@ abstract class KubernetesTrainingService { return Promise.resolve(kubernetesTrialJob); } - public async getTrialLog(trialJobId: string, logType: LogType): Promise { + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { throw new MethodNotImplementedError(); } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index c3c59a2cbd..67d7b1edeb 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -118,7 +118,7 @@ abstract class PAITrainingService implements TrainingService { return jobs; } - public async getTrialLog(trialJobId: string, logType: LogType): Promise { + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { throw new MethodNotImplementedError(); } diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 604da2e0dd..d361ab2944 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -175,10 +175,10 @@ class RemoteMachineTrainingService implements TrainingService { /** * Get trial job log - * @param trialJobId ID of trial job - * @param logType 'TRIAL_LOG' | 'TRIAL_STDERR' + * @param _trialJobId ID of trial job + * @param _logType 'TRIAL_LOG' | 'TRIAL_STDERR' */ - public async getTrialLog(trialJobId: string, logType: LogType): Promise { + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { throw new MethodNotImplementedError(); } diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 363565214f..1e3b75cc86 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -48,7 +48,7 @@ class RouterTrainingService implements TrainingService { return await this.internalTrainingService.getTrialJob(trialJobId); } - public async getTrialLog(trialJobId: string, logType: LogType): Promise { + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { throw new MethodNotImplementedError(); } diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index f8797fce79..b63460d699 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -95,7 +95,7 @@ class TrialDispatcher implements TrainingService { return trial; } - public async getTrialLog(trialJobId: string, logType: LogType): Promise { + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { throw new MethodNotImplementedError(); } From 7b45ecf728da88d364ad7f95522d2fea5397d641 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Sat, 25 Jul 2020 02:30:34 +0800 Subject: [PATCH 06/30] add frontend support --- .../src/components/public-child/OpenRow.tsx | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/webui/src/components/public-child/OpenRow.tsx b/src/webui/src/components/public-child/OpenRow.tsx index a0c6c274c1..44db47fe7f 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -1,7 +1,10 @@ import * as React from 'react'; +// import axios from 'axios'; import * as copy from 'copy-to-clipboard'; import { Stack, PrimaryButton, Pivot, PivotItem } from 'office-ui-fabric-react'; import { Trial } from '../../static/model/trial'; +import { MANAGER_IP } from '../../static/const'; +// import { downFile } from '../../static/function'; import { EXPERIMENT, TRIALS } from '../../static/datamodel'; import JSONTree from 'react-json-tree'; import PaiTrialLog from '../public-child/PaiTrialLog'; @@ -9,6 +12,7 @@ import TrialLog from '../public-child/TrialLog'; import MessageInfo from '../Modals/MessageInfo'; import '../../static/style/overview.scss'; import '../../static/style/copyParameter.scss'; +import '../../static/style/openRow.scss'; interface OpenRowProps { trialId: string; @@ -55,9 +59,33 @@ class OpenRow extends React.Component { } } + // downloadTrialLog = (): void => { + // const { trialId } = this.props; + // // download this trial's log + // const trialLogPromise = axios.get(`${MANAGER_IP}/trial-log/${trialId}/TRIAL_LOG`); + // const stderrPromise = axios.get(`${MANAGER_IP}/trial-log/${trialId}/TRIAL_STDERR`); + // trialLogPromise.then(res => { + // if (res.status === 200) { + // // start to download + // downFile(res.data, `${trialId}.log`); + // } + // }); + // stderrPromise.then(res => { + // if (res.status === 200) { + // // start to download + // downFile(res.data, `${trialId}.stderr`); + // } + // }); + // } + + openTrialLog = (type: string): void => { + window.open(`${MANAGER_IP}/trial-log/${this.props.trialId}/${type}`); + } + render(): React.ReactNode { const { isHidenInfo, typeInfo, info } = this.state; const trialId = this.props.trialId; + console.info(trialId); // eslint-disable-line const trial = TRIALS.getTrial(trialId); const logPathRow = trial.info.logPath || 'This trial\'s log path is not available.'; return ( @@ -105,7 +133,23 @@ class OpenRow extends React.Component { logCollection={EXPERIMENT.logCollectionEnabled} /> : - +
+ + {/* view each trial log in drawer*/} +
+
+ + +
+
+
} From 6cd5e8fbdea698f7e326816fbfd8ece4af76ee3a Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Sat, 25 Jul 2020 02:52:47 +0800 Subject: [PATCH 07/30] add frontend support --- src/webui/src/static/model/trialmanager.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/webui/src/static/model/trialmanager.ts b/src/webui/src/static/model/trialmanager.ts index 7ef5603196..c4ef862547 100644 --- a/src/webui/src/static/model/trialmanager.ts +++ b/src/webui/src/static/model/trialmanager.ts @@ -137,10 +137,12 @@ class TrialManager { const hpObject = JSON.parse(hyperParameters); const parameterId = hpObject["parameter_id"]; trial = { - id: `${jobInfo.id}-${parameterId}`, + // id: `${jobInfo.id}-${parameterId}`, + id: jobInfo.id, jobId: jobInfo.id, parameterId: parameterId, - sequenceId: parameterId, + // sequenceId: parameterId, + sequenceId: jobInfo.sequenceId, status: "SUCCEEDED", startTime: jobInfo.startTime, endTime: jobInfo.startTime, From 4309a46e0e46f8bae627fe328324698c0aae1c07 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 27 Jul 2020 10:24:09 +0800 Subject: [PATCH 08/30] remove useless comment --- .../src/components/public-child/OpenRow.tsx | 20 ------------------- src/webui/src/static/model/trialmanager.ts | 2 -- 2 files changed, 22 deletions(-) diff --git a/src/webui/src/components/public-child/OpenRow.tsx b/src/webui/src/components/public-child/OpenRow.tsx index 44db47fe7f..a2f0d75173 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -59,25 +59,6 @@ class OpenRow extends React.Component { } } - // downloadTrialLog = (): void => { - // const { trialId } = this.props; - // // download this trial's log - // const trialLogPromise = axios.get(`${MANAGER_IP}/trial-log/${trialId}/TRIAL_LOG`); - // const stderrPromise = axios.get(`${MANAGER_IP}/trial-log/${trialId}/TRIAL_STDERR`); - // trialLogPromise.then(res => { - // if (res.status === 200) { - // // start to download - // downFile(res.data, `${trialId}.log`); - // } - // }); - // stderrPromise.then(res => { - // if (res.status === 200) { - // // start to download - // downFile(res.data, `${trialId}.stderr`); - // } - // }); - // } - openTrialLog = (type: string): void => { window.open(`${MANAGER_IP}/trial-log/${this.props.trialId}/${type}`); } @@ -85,7 +66,6 @@ class OpenRow extends React.Component { render(): React.ReactNode { const { isHidenInfo, typeInfo, info } = this.state; const trialId = this.props.trialId; - console.info(trialId); // eslint-disable-line const trial = TRIALS.getTrial(trialId); const logPathRow = trial.info.logPath || 'This trial\'s log path is not available.'; return ( diff --git a/src/webui/src/static/model/trialmanager.ts b/src/webui/src/static/model/trialmanager.ts index c4ef862547..d37acd86af 100644 --- a/src/webui/src/static/model/trialmanager.ts +++ b/src/webui/src/static/model/trialmanager.ts @@ -137,11 +137,9 @@ class TrialManager { const hpObject = JSON.parse(hyperParameters); const parameterId = hpObject["parameter_id"]; trial = { - // id: `${jobInfo.id}-${parameterId}`, id: jobInfo.id, jobId: jobInfo.id, parameterId: parameterId, - // sequenceId: parameterId, sequenceId: jobInfo.sequenceId, status: "SUCCEEDED", startTime: jobInfo.startTime, From 0d7974b6a3ddfd42293a175f9178d3ed9113a4aa Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 27 Jul 2020 10:25:33 +0800 Subject: [PATCH 09/30] remove useless comment --- src/webui/src/components/public-child/OpenRow.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/webui/src/components/public-child/OpenRow.tsx b/src/webui/src/components/public-child/OpenRow.tsx index a2f0d75173..19ab6ecd13 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -1,10 +1,8 @@ import * as React from 'react'; -// import axios from 'axios'; import * as copy from 'copy-to-clipboard'; import { Stack, PrimaryButton, Pivot, PivotItem } from 'office-ui-fabric-react'; import { Trial } from '../../static/model/trial'; import { MANAGER_IP } from '../../static/const'; -// import { downFile } from '../../static/function'; import { EXPERIMENT, TRIALS } from '../../static/datamodel'; import JSONTree from 'react-json-tree'; import PaiTrialLog from '../public-child/PaiTrialLog'; From 7545a834b488d90be5a0e48394a4b8b2863cd3c0 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Tue, 28 Jul 2020 11:17:57 +0800 Subject: [PATCH 10/30] handle empty log --- src/nni_manager/rest_server/restHandler.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index b91fbeeb43..af44d71a01 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -272,6 +272,9 @@ class NNIRestHandler { private getTrialLog(router: Router): void { router.get('/trial-log/:id/:type', async(req: Request, res: Response) => { this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => { + if (log === '') { + log = 'No logs available.' + } res.send(log); }).catch((err: Error) => { this.handleError(err, res); From f702e97a795cc32d28d2cb46b7e251cd3d3f28f3 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Fri, 31 Jul 2020 15:26:11 +0800 Subject: [PATCH 11/30] resolve conflict --- src/nni_manager/training_service/reusable/trialDispatcher.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 59124303e7..046f389ca2 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -12,7 +12,7 @@ import * as component from '../../common/component'; import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; -import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils'; import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands'; import { ScheduleResultType } from '../../training_service/common/gpuData'; From 607ed216d84a487560a27ffb9bc5aaed5c91ca33 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Fri, 31 Jul 2020 15:35:14 +0800 Subject: [PATCH 12/30] more friendly MethodNotImplemented message --- src/nni_manager/core/nnimanager.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index ad243f4835..40afb7b9f8 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -326,7 +326,11 @@ class NNIManager implements Manager { } public async getTrialLog(trialJobId: string, logType: LogType): Promise { - return this.trainingService.getTrialLog(trialJobId, logType); + try { + return this.trainingService.getTrialLog(trialJobId, logType); + } catch (error) { + return Promise.reject(new Error(`Error: ${this.experimentProfile.params.trainingServicePlatform} training service does not support retriving log`)); + } } public getExperimentProfile(): Promise { From 3aa2183966542d7141f5471f1b909bedfacef10d Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Sun, 9 Aug 2020 23:28:25 +0800 Subject: [PATCH 13/30] view trial stderr -> view trial error --- src/webui/src/components/public-child/OpenRow.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webui/src/components/public-child/OpenRow.tsx b/src/webui/src/components/public-child/OpenRow.tsx index 19ab6ecd13..4f9e836ab4 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -122,7 +122,7 @@ class OpenRow extends React.Component { /> From 1514a1b4d9c715e27ce3cf68a0bd03cbd17deb76 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 00:33:16 +0800 Subject: [PATCH 14/30] fix typo trialJobDeatil -> trialJobDetail --- .../training_service/local/localTrainingService.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 98ec8c10a7..dc6874c594 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -462,8 +462,8 @@ class LocalTrainingService implements TrainingService { while (!this.stopping) { while (!this.stopping && this.jobQueue.length !== 0) { const trialJobId: string = this.jobQueue[0]; - const trialJobDeatil: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); - if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING') { + const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); + if (trialJobDetail !== undefined && trialJobDetail.status === 'WAITING') { const [success, resource] = this.tryGetAvailableResource(); if (!success) { break; From 9e0dead69742b80f79f58dec3263b257ffd0101c Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 01:43:11 +0800 Subject: [PATCH 15/30] add ut --- src/nni_manager/core/nnimanager.ts | 6 +----- .../training_service/test/localTrainingService.test.ts | 8 +++++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 40afb7b9f8..ad243f4835 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -326,11 +326,7 @@ class NNIManager implements Manager { } public async getTrialLog(trialJobId: string, logType: LogType): Promise { - try { - return this.trainingService.getTrialLog(trialJobId, logType); - } catch (error) { - return Promise.reject(new Error(`Error: ${this.experimentProfile.params.trainingServicePlatform} training service does not support retriving log`)); - } + return this.trainingService.getTrialLog(trialJobId, logType); } public getExperimentProfile(): Promise { diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index bc47e747ba..ab5fbb1aa3 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -55,7 +55,7 @@ describe('Unit Test for LocalTrainingService', () => { }); }); - it('Submit job and Cancel job', async () => { + it('Submit job, Get trial log and Cancel job', async () => { await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); // submit job @@ -68,6 +68,12 @@ describe('Unit Test for LocalTrainingService', () => { }; const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); + await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG').then((log: string) => { + chai.expect(log).to.be.a('string') + }) + await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR').then((log: string) => { + chai.expect(log).to.be.a('string') + }) await localTrainingService.cancelTrialJob(jobDetail.id); chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); }).timeout(20000); From 3453bf621ae68e0805743eb6c3eecdcf6d9720bb Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 01:52:16 +0800 Subject: [PATCH 16/30] add ut --- .../training_service/test/localTrainingService.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index ab5fbb1aa3..60f3373873 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -68,10 +68,10 @@ describe('Unit Test for LocalTrainingService', () => { }; const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); - await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG').then((log: string) => { + localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG').then((log: string) => { chai.expect(log).to.be.a('string') }) - await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR').then((log: string) => { + localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR').then((log: string) => { chai.expect(log).to.be.a('string') }) await localTrainingService.cancelTrialJob(jobDetail.id); From 2e98bd6702b817379b4345a025cc00b7559b3262 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 01:55:18 +0800 Subject: [PATCH 17/30] add ut --- .../training_service/test/localTrainingService.test.ts | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 60f3373873..2891f885c0 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -68,12 +68,10 @@ describe('Unit Test for LocalTrainingService', () => { }; const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); - localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG').then((log: string) => { - chai.expect(log).to.be.a('string') - }) - localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR').then((log: string) => { - chai.expect(log).to.be.a('string') - }) + + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string') + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string') + await localTrainingService.cancelTrialJob(jobDetail.id); chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); }).timeout(20000); From cd0333c174dfce110c0237a8d1cb3af897ba78f6 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 10:35:21 +0800 Subject: [PATCH 18/30] add ut --- .../training_service/test/localTrainingService.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 2891f885c0..0288e5f3b3 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -69,6 +69,7 @@ describe('Unit Test for LocalTrainingService', () => { const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); + localTrainingService.run() chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string') chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string') From 2f413dabcfedfd11a9cc064608e131926d6575ba Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 10:35:49 +0800 Subject: [PATCH 19/30] add ut --- .../training_service/test/localTrainingService.test.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 0288e5f3b3..58c56da426 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -69,9 +69,9 @@ describe('Unit Test for LocalTrainingService', () => { const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); - localTrainingService.run() - chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string') - chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string') + localTrainingService.run(); + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string'); + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string'); await localTrainingService.cancelTrialJob(jobDetail.id); chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); From 92ef5c6d873e7467c3ee1d5d7f3a3830f7b86cee Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 11:13:37 +0800 Subject: [PATCH 20/30] add ut --- .../test/localTrainingService.test.ts | 28 +++++++++++++++---- .../training_service/test/mockedTrial.py | 12 ++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 58c56da426..9f7fb50b6b 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -55,7 +55,7 @@ describe('Unit Test for LocalTrainingService', () => { }); }); - it('Submit job, Get trial log and Cancel job', async () => { + it('Submit job and Cancel job', async () => { await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); // submit job @@ -68,13 +68,31 @@ describe('Unit Test for LocalTrainingService', () => { }; const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); + await localTrainingService.cancelTrialJob(jobDetail.id); + chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); + }).timeout(20000); + + it('Get trial log', async () => { + // set meta data + const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` + await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); + + // submit job + const form: TrialJobApplicationForm = { + sequenceId: 0, + hyperParameters: { + value: 'mock hyperparameters', + index: 0 + } + }; + const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); + chai.expect(jobDetail.status).to.be.equals('WAITING'); + localTrainingService.listTrialJobs().then((jobList)=>{ + chai.expect(jobList.length).to.be.equals(1); + }); - localTrainingService.run(); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string'); - - await localTrainingService.cancelTrialJob(jobDetail.id); - chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); }).timeout(20000); it('Read metrics, Add listener, and remove listener', async () => { diff --git a/src/nni_manager/training_service/test/mockedTrial.py b/src/nni_manager/training_service/test/mockedTrial.py index 5f85934cea..ac81d09221 100644 --- a/src/nni_manager/training_service/test/mockedTrial.py +++ b/src/nni_manager/training_service/test/mockedTrial.py @@ -5,8 +5,19 @@ import time METRICS_FILENAME = '.nni/metrics' +TRIAL_LOG_FILENAME = 'trial.log' +TRIAL_STDERR_FILENAME = 'stderr' MAGIC = 'ME' +def generate_logfile(): + out_dir = os.getenv('NNI_SYS_DIR') + if not os.path.isdir(out_dir): + raise Exception('Can not find NNI_SYS_DIR: {}'.format(out_dir)) + with open(os.path.join(out_dir, TRIAL_LOG_FILENAME)) as f: + f.write('This is trial log') + with open(os.path.join(out_dir, TRIAL_STDERR_FILENAME)) as f: + f.write('This is stderr') + def sdk_send_data(data): out_dir = os.getenv('NNI_SYS_DIR') if not os.path.isdir(out_dir): @@ -21,6 +32,7 @@ def sdk_send_data(data): f.write('ME{:06d}{}'.format(datalen, wrapped_data)) def user_code(): + generate_logfile() epochs = 20 From c0ed056a39bca0baf62617b3605c05c9a5263a56 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 13:03:49 +0800 Subject: [PATCH 21/30] add ut --- .../training_service/test/localTrainingService.test.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 9f7fb50b6b..49acb80fd2 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -86,13 +86,14 @@ describe('Unit Test for LocalTrainingService', () => { } }; const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); - chai.expect(jobDetail.status).to.be.equals('WAITING'); - localTrainingService.listTrialJobs().then((jobList)=>{ - chai.expect(jobList.length).to.be.equals(1); - }); + chai.expect(jobDetail.status).to.be.equals('WAITING'); + await delay(1000); + chai.expect(jobDetail.status).to.be.equals('RUNNING'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string'); + + await localTrainingService.cancelTrialJob(jobDetail.id); }).timeout(20000); it('Read metrics, Add listener, and remove listener', async () => { From 18ce44fad50d312cedd041459bffade54c8c292f Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 13:30:35 +0800 Subject: [PATCH 22/30] add ut --- .../training_service/test/localTrainingService.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 49acb80fd2..8cd101af44 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -89,7 +89,7 @@ describe('Unit Test for LocalTrainingService', () => { chai.expect(jobDetail.status).to.be.equals('WAITING'); await delay(1000); - chai.expect(jobDetail.status).to.be.equals('RUNNING'); + //chai.expect(jobDetail.status).to.be.equals('RUNNING'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string'); From 73989e06459120f9e954eb3edf55898ec4ecd2ae Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Mon, 10 Aug 2020 14:03:39 +0800 Subject: [PATCH 23/30] add it --- .../training_service/test/localTrainingService.test.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 8cd101af44..633014d454 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -88,7 +88,8 @@ describe('Unit Test for LocalTrainingService', () => { const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); chai.expect(jobDetail.status).to.be.equals('WAITING'); - await delay(1000); + localTrainingService.run() + await delay(5000); //chai.expect(jobDetail.status).to.be.equals('RUNNING'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string'); chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string'); From dfabaad2c31121ee05b053492e1e05a484b9a47d Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Tue, 11 Aug 2020 16:01:54 +0800 Subject: [PATCH 24/30] add ut --- .../test/localTrainingService.test.ts | 17 +++++++++-------- .../training_service/test/mockedTrial.py | 12 ------------ 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 633014d454..d09446e71e 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -7,6 +7,7 @@ import * as assert from 'assert'; import * as chai from 'chai'; import * as chaiAsPromised from 'chai-as-promised'; import * as fs from 'fs'; +import * as path from 'path'; import * as tmp from 'tmp'; import * as component from '../../common/component'; import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; @@ -86,14 +87,14 @@ describe('Unit Test for LocalTrainingService', () => { } }; const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); - - chai.expect(jobDetail.status).to.be.equals('WAITING'); - localTrainingService.run() - await delay(5000); - //chai.expect(jobDetail.status).to.be.equals('RUNNING'); - chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.a('string'); - chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.a('string'); - + fs.mkdirSync(jobDetail.workingDirectory) + fs.writeFileSync(path.join(jobDetail.workingDirectory, 'trial.log'), 'trial log') + fs.writeFileSync(path.join(jobDetail.workingDirectory, 'stderr'), 'trial stderr') + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.equals('trial log'); + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.equals('trial stderr'); + fs.unlinkSync(path.join(jobDetail.workingDirectory, 'trial.log')) + fs.unlinkSync(path.join(jobDetail.workingDirectory, 'stderr')) + fs.rmdirSync(jobDetail.workingDirectory) await localTrainingService.cancelTrialJob(jobDetail.id); }).timeout(20000); diff --git a/src/nni_manager/training_service/test/mockedTrial.py b/src/nni_manager/training_service/test/mockedTrial.py index ac81d09221..5f85934cea 100644 --- a/src/nni_manager/training_service/test/mockedTrial.py +++ b/src/nni_manager/training_service/test/mockedTrial.py @@ -5,19 +5,8 @@ import time METRICS_FILENAME = '.nni/metrics' -TRIAL_LOG_FILENAME = 'trial.log' -TRIAL_STDERR_FILENAME = 'stderr' MAGIC = 'ME' -def generate_logfile(): - out_dir = os.getenv('NNI_SYS_DIR') - if not os.path.isdir(out_dir): - raise Exception('Can not find NNI_SYS_DIR: {}'.format(out_dir)) - with open(os.path.join(out_dir, TRIAL_LOG_FILENAME)) as f: - f.write('This is trial log') - with open(os.path.join(out_dir, TRIAL_STDERR_FILENAME)) as f: - f.write('This is stderr') - def sdk_send_data(data): out_dir = os.getenv('NNI_SYS_DIR') if not os.path.isdir(out_dir): @@ -32,7 +21,6 @@ def sdk_send_data(data): f.write('ME{:06d}{}'.format(datalen, wrapped_data)) def user_code(): - generate_logfile() epochs = 20 From 40c0fd9f19b6e2dcfe8e8e7ad2b2c78a824c9631 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Tue, 11 Aug 2020 17:20:20 +0800 Subject: [PATCH 25/30] STDERR -> TRIAL ERROR --- src/nni_manager/common/trainingService.ts | 2 +- src/nni_manager/training_service/local/localTrainingService.ts | 2 +- src/webui/src/components/public-child/OpenRow.tsx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/common/trainingService.ts b/src/nni_manager/common/trainingService.ts index d5e22a4648..4edcf16ab6 100644 --- a/src/nni_manager/common/trainingService.ts +++ b/src/nni_manager/common/trainingService.ts @@ -8,7 +8,7 @@ */ type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED'; -type LogType = 'TRIAL_LOG' | 'TRIAL_STDERR'; +type LogType = 'TRIAL_LOG' | 'TRIAL_ERROR'; interface TrainingServiceMetadata { readonly key: string; diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index dc6874c594..a69bff8df8 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -188,7 +188,7 @@ class LocalTrainingService implements TrainingService { let logPath: string; if (logType === 'TRIAL_LOG') { logPath = path.join(this.rootDir, 'trials', trialJobId, 'trial.log'); - } else if (logType === 'TRIAL_STDERR') { + } else if (logType === 'TRIAL_ERROR') { logPath = path.join(this.rootDir, 'trials', trialJobId, 'stderr'); } else { throw new Error('unexpected log type'); diff --git a/src/webui/src/components/public-child/OpenRow.tsx b/src/webui/src/components/public-child/OpenRow.tsx index 4f9e836ab4..a20cf5313f 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -121,7 +121,7 @@ class OpenRow extends React.Component { text="View trial log" /> From f7ec9029ac247efaffc25fac27a4a4779e0f7e54 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Tue, 11 Aug 2020 17:40:23 +0800 Subject: [PATCH 26/30] add ut --- .../training_service/test/localTrainingService.test.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index d09446e71e..25fea4108d 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -11,7 +11,7 @@ import * as path from 'path'; import * as tmp from 'tmp'; import * as component from '../../common/component'; import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; -import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils'; +import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { LocalTrainingService } from '../local/localTrainingService'; @@ -86,12 +86,15 @@ describe('Unit Test for LocalTrainingService', () => { index: 0 } }; + let rootDir: string = getExperimentRootDir() + console.log(rootDir) const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); + fs.mkdirSync(rootDir) fs.mkdirSync(jobDetail.workingDirectory) fs.writeFileSync(path.join(jobDetail.workingDirectory, 'trial.log'), 'trial log') fs.writeFileSync(path.join(jobDetail.workingDirectory, 'stderr'), 'trial stderr') chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.equals('trial log'); - chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_STDERR')).to.be.equals('trial stderr'); + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR')).to.be.equals('trial stderr'); fs.unlinkSync(path.join(jobDetail.workingDirectory, 'trial.log')) fs.unlinkSync(path.join(jobDetail.workingDirectory, 'stderr')) fs.rmdirSync(jobDetail.workingDirectory) From a01a1e8ee5f877dbb763830b18ac34f8587dc600 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Tue, 11 Aug 2020 21:01:25 +0800 Subject: [PATCH 27/30] update ut --- .../test/localTrainingService.test.ts | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 25fea4108d..ba320cd6f1 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -12,6 +12,7 @@ import * as tmp from 'tmp'; import * as component from '../../common/component'; import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils'; +import { getExperimentId } from '../../common/experimentStartupInfo' import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { LocalTrainingService } from '../local/localTrainingService'; @@ -86,10 +87,12 @@ describe('Unit Test for LocalTrainingService', () => { index: 0 } }; - let rootDir: string = getExperimentRootDir() - console.log(rootDir) + const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); - fs.mkdirSync(rootDir) + + // get trial log + const rootDir: string = getExperimentRootDir() + fs.mkdirSync(path.join(rootDir, 'trials')) fs.mkdirSync(jobDetail.workingDirectory) fs.writeFileSync(path.join(jobDetail.workingDirectory, 'trial.log'), 'trial log') fs.writeFileSync(path.join(jobDetail.workingDirectory, 'stderr'), 'trial stderr') @@ -98,6 +101,8 @@ describe('Unit Test for LocalTrainingService', () => { fs.unlinkSync(path.join(jobDetail.workingDirectory, 'trial.log')) fs.unlinkSync(path.join(jobDetail.workingDirectory, 'stderr')) fs.rmdirSync(jobDetail.workingDirectory) + fs.rmdirSync(path.join(rootDir, 'trials')) + await localTrainingService.cancelTrialJob(jobDetail.id); }).timeout(20000); From 5e0a5bfa9a226143079ebd5d96c0bf88715a1522 Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Wed, 12 Aug 2020 14:48:11 +0800 Subject: [PATCH 28/30] remove unused import --- .../training_service/test/localTrainingService.test.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index ba320cd6f1..85e0a7988f 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -10,9 +10,8 @@ import * as fs from 'fs'; import * as path from 'path'; import * as tmp from 'tmp'; import * as component from '../../common/component'; -import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; +import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingService'; import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils'; -import { getExperimentId } from '../../common/experimentStartupInfo' import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { LocalTrainingService } from '../local/localTrainingService'; From 5fbd017a01502f9cd8e22b00ded88997cc158feb Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Wed, 12 Aug 2020 14:48:53 +0800 Subject: [PATCH 29/30] remove unused import --- .../training_service/test/localTrainingService.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index 85e0a7988f..ef71c30f12 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -3,7 +3,6 @@ 'use strict'; -import * as assert from 'assert'; import * as chai from 'chai'; import * as chaiAsPromised from 'chai-as-promised'; import * as fs from 'fs'; From d21d524dd72e1c2f4bbabb37bc1d2b0b4a6c85cc Mon Sep 17 00:00:00 2001 From: Junwei Sun Date: Wed, 12 Aug 2020 15:11:32 +0800 Subject: [PATCH 30/30] update ut --- .../training_service/test/localTrainingService.test.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index ef71c30f12..fbaaedcd41 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -73,8 +73,6 @@ describe('Unit Test for LocalTrainingService', () => { }).timeout(20000); it('Get trial log', async () => { - // set meta data - const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); // submit job