From 04deb54d5f9e41a242860149c7de5bc54384d55c Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Mon, 27 Jul 2020 15:37:46 +0800 Subject: [PATCH] Upgrade pai restful API (#2722) --- azure-pipelines.yml | 2 +- .../rest_server/restValidationSchemas.ts | 1 - .../pai/paiJobInfoCollector.ts | 7 +- .../pai/paiK8S/paiK8STrainingService.ts | 11 +-- .../pai/paiTrainingService.ts | 4 +- .../environments/openPaiEnvironmentService.ts | 96 +++---------------- tools/nni_cmd/config_schema.py | 9 +- 7 files changed, 26 insertions(+), 104 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 010c5c4806..839aa2c4d6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -106,8 +106,8 @@ jobs: steps: - script: | + echo "##vso[task.setvariable variable=PATH]/usr/local/Cellar/python@3.7/3.7.8_1/bin:${HOME}/Library/Python/3.7/bin:${PATH}" python3 -m pip install --upgrade pip setuptools - echo "##vso[task.setvariable variable=PATH]${HOME}/Library/Python/3.7/bin:${PATH}" displayName: 'Install python tools' - script: | echo "network-timeout 600000" >> ${HOME}/.yarnrc diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 302073707e..78fa1017fe 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -103,7 +103,6 @@ export namespace ValidationSchemas { }), pai_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase userName: joi.string().min(1).required(), - passWord: joi.string().min(1), token: joi.string().min(1), host: joi.string().min(1).required(), reuse: joi.boolean(), diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index eb15765a4f..221d84b724 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -52,7 +52,7 @@ export class PAIJobInfoCollector { // Rest call to get PAI job info and update status // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const getJobInfoRequest: request.Options = { - uri: `${protocol}://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`, + uri: `${protocol}://${paiClusterConfig.host}/rest-server/api/v2/jobs/${paiClusterConfig.userName}~${paiTrialJob.paiJobName}`, method: 'GET', json: true, headers: { @@ -63,8 +63,9 @@ export class PAIJobInfoCollector { //TODO : pass in request timeout param? request(getJobInfoRequest, (error: Error, response: request.Response, _body: any) => { - if ((error !== undefined && error !== null) || response.statusCode >= 500) { - this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); + // Status code 200 for success + if ((error !== undefined && error !== null) || response.statusCode >= 400) { + // The job refresh time could be ealier than job submission, so it might return 404 error code, need refactor // Queried PAI job info failed, set job status to UNKNOWN if (paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { paiTrialJob.status = 'UNKNOWN'; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index e243387d39..54623dc2b6 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -55,12 +55,7 @@ class PAIK8STrainingService extends PAITrainingService { this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService)); this.paiClusterConfig = JSON.parse(value); this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if (this.paiClusterConfig.passWord) { - // Get PAI authentication token - await this.updatePaiToken(); - } else if (this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; - } + this.paiToken = this.paiClusterConfig.token; break; case TrialConfigMetadataKey.TRIAL_CONFIG: { @@ -283,18 +278,20 @@ class PAIK8STrainingService extends PAITrainingService { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', body: paiJobConfig, + followAllRedirects: true, headers: { 'Content-Type': 'text/yaml', Authorization: `Bearer ${this.paiToken}` } }; request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + // If submit success, will get status code 202. refer: https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml if ((error !== undefined && error !== null) || response.statusCode >= 400) { const errorMessage: string = (error !== undefined && error !== null) ? error.message : `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${body}`; - this.log.error(errorMessage); trialJobDetail.status = 'FAILED'; + deferred.reject(errorMessage); } else { trialJobDetail.submitTime = Date.now(); } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index e26c16ecee..aff583de54 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -162,8 +162,7 @@ abstract class PAITrainingService implements TrainingService { } const stopJobRequest: request.Options = { - uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\ -/jobs/${trialJobDetail.paiJobName}/executionType`, + uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${trialJobDetail.paiJobName}/executionType`, method: 'PUT', json: true, body: { value: 'STOP' }, @@ -178,6 +177,7 @@ abstract class PAITrainingService implements TrainingService { const deferred: Deferred = new Deferred(); request(stopJobRequest, (error: Error, response: request.Response, _body: any) => { + // Status code 202 for success. if ((error !== undefined && error !== null) || response.statusCode >= 400) { this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`); deferred.reject((error !== undefined && error !== null) ? error.message : diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 1a527b26f9..79c2e7da98 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -28,15 +28,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService { private paiTrialConfig: NNIPAIK8STrialConfig | undefined; private paiJobConfig: any; private paiToken?: string; - private paiTokenUpdateTime?: number; - private readonly paiTokenUpdateInterval: number; private protocol: string = 'http'; private experimentId: string; constructor() { super(); - this.paiTokenUpdateInterval = 7200000; //2hours this.experimentId = getExperimentId(); } @@ -49,12 +46,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: this.paiClusterConfig = JSON.parse(value); this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if (this.paiClusterConfig.passWord) { - // Get PAI authentication token - await this.updatePaiToken(); - } else if (this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; - } + this.paiToken = this.paiClusterConfig.token; break; case TrialConfigMetadataKey.TRIAL_CONFIG: { @@ -81,7 +73,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { const deferred: Deferred = new Deferred(); - await this.refreshPlatform(); if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); @@ -101,9 +92,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService { }; request(getJobInfoRequest, async (error: any, response: request.Response, body: any) => { + // Status code 200 for success if ((error !== undefined && error !== null) || response.statusCode >= 400) { - this.log.error(`OpenPAI: get environment list from PAI Cluster failed!\nerror: ${error}`); - deferred.reject(error); + const errorMessage: string = (error !== undefined && error !== null) ? error.message : + `OpenPAI: get environment list from PAI Cluster failed!, http code:${response.statusCode}, http body: ${JSON.stringify(body)}`; + this.log.error(`${errorMessage}`); + deferred.reject(errorMessage); } else { const jobInfos = new Map(); body.forEach((jobInfo: any) => { @@ -157,8 +151,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { public async startEnvironment(environment: EnvironmentInformation): Promise { const deferred: Deferred = new Deferred(); - await this.refreshPlatform(); - if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } @@ -184,18 +176,21 @@ export class OpenPaiEnvironmentService extends EnvironmentService { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', body: paiJobConfig, + followAllRedirects: true, headers: { 'Content-Type': 'text/yaml', Authorization: `Bearer ${this.paiToken}` } }; request(submitJobRequest, (error, response, body) => { + // Status code 202 for success, refer https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml if ((error !== undefined && error !== null) || response.statusCode >= 400) { const errorMessage: string = (error !== undefined && error !== null) ? error.message : `start environment ${environment.jobId} failed, http code:${response.statusCode}, http body: ${body}`; this.log.error(errorMessage); environment.status = 'FAILED'; + deferred.reject(errorMessage); } deferred.resolve(); }); @@ -230,8 +225,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService { try { request(stopJobRequest, (error, response, _body) => { try { + // Status code 202 for success. if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) { - this.log.error(`OpenPAI: stop job ${environment.jobId} failed with ${response.statusCode}\n${error}`); + const errorMessage: string = (error !== undefined && error !== null) ? error.message : + `OpenPAI: stop job ${environment.jobId} failed, http code:${response.statusCode}, http body: ${_body}`; + this.log.error(`${errorMessage}`); deferred.reject((error !== undefined && error !== null) ? error : `Stop trial failed, http code: ${response.statusCode}`); } else { @@ -251,19 +249,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { return deferred.promise; } - private async refreshPlatform(): Promise { - if (this.paiClusterConfig && this.paiClusterConfig.passWord) { - try { - await this.updatePaiToken(); - } catch (error) { - this.log.error(`${error}`); - if (this.paiToken === undefined) { - throw new Error(error); - } - } - } - } - private generateJobConfigInYamlFormat(environment: EnvironmentInformation): any { if (this.paiTrialConfig === undefined) { throw new Error('trial config is not initialized'); @@ -363,59 +348,4 @@ export class OpenPaiEnvironmentService extends EnvironmentService { return host; } } - /** - * Update pai token by the interval time or initialize the pai token - */ - protected async updatePaiToken(): Promise { - const deferred: Deferred = new Deferred(); - - const currentTime: number = new Date().getTime(); - //If pai token initialized and not reach the interval time, do not update - if (this.paiTokenUpdateTime !== undefined && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) { - return Promise.resolve(); - } - - if (this.paiClusterConfig === undefined) { - const paiClusterConfigError: string = `pai cluster config not initialized!`; - this.log.error(`${paiClusterConfigError}`); - throw Error(`${paiClusterConfigError}`); - } - - const authenticationReq: request.Options = { - uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/token`, - method: 'POST', - json: true, - body: { - username: this.paiClusterConfig.userName, - password: this.paiClusterConfig.passWord - } - }; - - request(authenticationReq, (error: any, response: request.Response, body: any) => { - if (error !== undefined && error !== null) { - this.log.error(`Get PAI token failed: ${error.message}, authenticationReq: ${authenticationReq}`); - deferred.reject(new Error(`Get PAI token failed: ${error.message}`)); - } else { - if (response.statusCode !== 200) { - this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}, authenticationReq: ${authenticationReq}`); - deferred.reject(new Error(`Get PAI token failed code: ${response.statusCode}, body: ${response.body}, authenticationReq: ${authenticationReq}, please check paiConfig username or password`)); - } else { - this.paiToken = body.token; - this.paiTokenUpdateTime = new Date().getTime(); - deferred.resolve(); - } - } - }); - - let timeoutId: NodeJS.Timer; - const timeoutDelay: Promise = new Promise((_resolve: Function, reject: Function): void => { - // Set timeout and reject the promise once reach timeout (5 seconds) - timeoutId = setTimeout( - () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')), - 5000); - }); - - return Promise.race([timeoutDelay, deferred.promise]) - .finally(() => { clearTimeout(timeoutId); }); - } } diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index b8bed07fb9..8d30562e94 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -200,17 +200,12 @@ def validate(self, data): } pai_config_schema = { - 'paiConfig': Or({ - 'userName': setType('userName', str), - 'passWord': setType('passWord', str), - 'host': setType('host', str), - Optional('reuse'): setType('reuse', bool) - }, { + 'paiConfig': { 'userName': setType('userName', str), 'token': setType('token', str), 'host': setType('host', str), Optional('reuse'): setType('reuse', bool) - }) + } } dlts_trial_schema = {