-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Upgrade pai restful API #2722
Upgrade pai restful API #2722
Changes from 19 commits
dcd2ffd
3b8b6fb
916e444
caeffb8
57c300e
65660e6
9376d6a
5fef3cf
5544ae8
a16db11
f9fdfee
c5e26ef
10a04ba
590f373
8d7c6cc
c9855ca
2cbe75a
8a3ea9c
a1c2b22
e21e231
f7cd10e
3da5864
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,7 +52,7 @@ export class PAIJobInfoCollector { | |
// Rest call to get PAI job info and update status | ||
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API | ||
const getJobInfoRequest: request.Options = { | ||
uri: `${protocol}://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`, | ||
uri: `${protocol}://${paiClusterConfig.host}/rest-server/api/v2/jobs/${paiClusterConfig.userName}~${paiTrialJob.paiJobName}`, | ||
method: 'GET', | ||
json: true, | ||
headers: { | ||
|
@@ -63,8 +63,11 @@ export class PAIJobInfoCollector { | |
|
||
//TODO : pass in request timeout param? | ||
request(getJobInfoRequest, (error: Error, response: request.Response, _body: any) => { | ||
if ((error !== undefined && error !== null) || response.statusCode >= 500) { | ||
this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); | ||
// Status code 200 for success | ||
if ((error !== undefined && error !== null) || response.statusCode >= 400) { | ||
const errorMessage: string = (error !== undefined && error !== null) ? error.message : | ||
`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!, http code:${response.statusCode}, http body: ${JSON.stringify(_body)}`; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've tested the response data of PAI, seems this list API in v2 returns a object, and need to use JSON.stringify to decode the content, while the submit API returns a string directly, does not need to do this. |
||
this.log.error(`${errorMessage}`); | ||
// Queried PAI job info failed, set job status to UNKNOWN | ||
if (paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { | ||
paiTrialJob.status = 'UNKNOWN'; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,15 +28,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
private paiTrialConfig: NNIPAIK8STrialConfig | undefined; | ||
private paiJobConfig: any; | ||
private paiToken?: string; | ||
private paiTokenUpdateTime?: number; | ||
private readonly paiTokenUpdateInterval: number; | ||
private protocol: string = 'http'; | ||
|
||
private experimentId: string; | ||
|
||
constructor() { | ||
super(); | ||
this.paiTokenUpdateInterval = 7200000; //2hours | ||
this.experimentId = getExperimentId(); | ||
} | ||
|
||
|
@@ -49,12 +46,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: | ||
this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value); | ||
this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); | ||
if (this.paiClusterConfig.passWord) { | ||
// Get PAI authentication token | ||
await this.updatePaiToken(); | ||
} else if (this.paiClusterConfig.token) { | ||
this.paiToken = this.paiClusterConfig.token; | ||
} | ||
this.paiToken = this.paiClusterConfig.token; | ||
break; | ||
|
||
case TrialConfigMetadataKey.TRIAL_CONFIG: { | ||
|
@@ -81,7 +73,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
|
||
public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> { | ||
const deferred: Deferred<void> = new Deferred<void>(); | ||
await this.refreshPlatform(); | ||
|
||
if (this.paiClusterConfig === undefined) { | ||
throw new Error('PAI Cluster config is not initialized'); | ||
|
@@ -101,9 +92,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
}; | ||
|
||
request(getJobInfoRequest, async (error: any, response: request.Response, body: any) => { | ||
// Status code 200 for success | ||
if ((error !== undefined && error !== null) || response.statusCode >= 400) { | ||
this.log.error(`OpenPAI: get environment list from PAI Cluster failed!\nerror: ${error}`); | ||
deferred.reject(error); | ||
const errorMessage: string = (error !== undefined && error !== null) ? error.message : | ||
`OpenPAI: get environment list from PAI Cluster failed!, http code:${response.statusCode}, http body: ${JSON.stringify(body)}`; | ||
this.log.error(`${errorMessage}`); | ||
deferred.reject(errorMessage); | ||
} else { | ||
const jobInfos = new Map<string, any>(); | ||
body.forEach((jobInfo: any) => { | ||
|
@@ -154,8 +148,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
public async startEnvironment(environment: EnvironmentInformation): Promise<void> { | ||
const deferred: Deferred<void> = new Deferred<void>(); | ||
|
||
await this.refreshPlatform(); | ||
|
||
if (this.paiClusterConfig === undefined) { | ||
throw new Error('PAI Cluster config is not initialized'); | ||
} | ||
|
@@ -181,18 +173,21 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, | ||
method: 'POST', | ||
body: paiJobConfig, | ||
followAllRedirects: true, | ||
headers: { | ||
'Content-Type': 'text/yaml', | ||
Authorization: `Bearer ${this.paiToken}` | ||
} | ||
}; | ||
request(submitJobRequest, (error, response, body) => { | ||
// Status code 202 for success, refer https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml | ||
if ((error !== undefined && error !== null) || response.statusCode >= 400) { | ||
const errorMessage: string = (error !== undefined && error !== null) ? error.message : | ||
`start environment ${environment.jobId} failed, http code:${response.statusCode}, http body: ${body}`; | ||
|
||
this.log.error(errorMessage); | ||
environment.status = 'FAILED'; | ||
deferred.reject(errorMessage); | ||
} | ||
deferred.resolve(); | ||
}); | ||
|
@@ -227,8 +222,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
try { | ||
request(stopJobRequest, (error, response, _body) => { | ||
try { | ||
if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) { | ||
this.log.error(`OpenPAI: stop job ${environment.jobId} failed with ${response.statusCode}\n${error}`); | ||
// Status code 202 for success. | ||
if ((error !== undefined && error !== null) || (response && response.statusCode !== 202)) { | ||
const errorMessage: string = (error !== undefined && error !== null) ? error.message : | ||
`OpenPAI: stop job ${environment.jobId} failed, http code:${response.statusCode}, http body: ${_body}`; | ||
this.log.error(`${errorMessage}`); | ||
deferred.reject((error !== undefined && error !== null) ? error : | ||
`Stop trial failed, http code: ${response.statusCode}`); | ||
} else { | ||
|
@@ -248,19 +246,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
return deferred.promise; | ||
} | ||
|
||
private async refreshPlatform(): Promise<void> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why remove the refresh logic? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's ok to remove, as it only refresh pai token. And it's a pattern for future platform, which need regular refresh credential. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @squirrelsc you mean we will add dedicated token refresh logic in the future? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean, if there are similar requirements, here is a reference. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. then, why remove pai token refresh? no need to refresh token any more on pai? |
||
if (this.paiClusterConfig && this.paiClusterConfig.passWord) { | ||
try { | ||
await this.updatePaiToken(); | ||
} catch (error) { | ||
this.log.error(`${error}`); | ||
if (this.paiToken === undefined) { | ||
throw new Error(error); | ||
} | ||
} | ||
} | ||
} | ||
|
||
private generateJobConfigInYamlFormat(environment: EnvironmentInformation): any { | ||
if (this.paiTrialConfig === undefined) { | ||
throw new Error('trial config is not initialized'); | ||
|
@@ -360,59 +345,4 @@ export class OpenPaiEnvironmentService extends EnvironmentService { | |
return host; | ||
} | ||
} | ||
/** | ||
* Update pai token by the interval time or initialize the pai token | ||
*/ | ||
protected async updatePaiToken(): Promise<void> { | ||
const deferred: Deferred<void> = new Deferred<void>(); | ||
|
||
const currentTime: number = new Date().getTime(); | ||
//If pai token initialized and not reach the interval time, do not update | ||
if (this.paiTokenUpdateTime !== undefined && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) { | ||
return Promise.resolve(); | ||
} | ||
|
||
if (this.paiClusterConfig === undefined) { | ||
const paiClusterConfigError: string = `pai cluster config not initialized!`; | ||
this.log.error(`${paiClusterConfigError}`); | ||
throw Error(`${paiClusterConfigError}`); | ||
} | ||
|
||
const authenticationReq: request.Options = { | ||
uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/token`, | ||
method: 'POST', | ||
json: true, | ||
body: { | ||
username: this.paiClusterConfig.userName, | ||
password: this.paiClusterConfig.passWord | ||
} | ||
}; | ||
|
||
request(authenticationReq, (error: any, response: request.Response, body: any) => { | ||
if (error !== undefined && error !== null) { | ||
this.log.error(`Get PAI token failed: ${error.message}, authenticationReq: ${authenticationReq}`); | ||
deferred.reject(new Error(`Get PAI token failed: ${error.message}`)); | ||
} else { | ||
if (response.statusCode !== 200) { | ||
this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}, authenticationReq: ${authenticationReq}`); | ||
deferred.reject(new Error(`Get PAI token failed code: ${response.statusCode}, body: ${response.body}, authenticationReq: ${authenticationReq}, please check paiConfig username or password`)); | ||
} else { | ||
this.paiToken = body.token; | ||
this.paiTokenUpdateTime = new Date().getTime(); | ||
deferred.resolve(); | ||
} | ||
} | ||
}); | ||
|
||
let timeoutId: NodeJS.Timer; | ||
const timeoutDelay: Promise<void> = new Promise<void>((_resolve: Function, reject: Function): void => { | ||
// Set timeout and reject the promise once reach timeout (5 seconds) | ||
timeoutId = setTimeout( | ||
() => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')), | ||
5000); | ||
}); | ||
|
||
return Promise.race([timeoutDelay, deferred.promise]) | ||
.finally(() => { clearTimeout(timeoutId); }); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please also update doc accordingly
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the PAIMode doc has already removed the password configuration, while it keeps in our code. Only need to remove code this time.