Skip to content

Commit

Permalink
adaptive support
Browse files Browse the repository at this point in the history
  • Loading branch information
pengwu22 committed Oct 12, 2020
1 parent adb51fa commit 449bff6
Show file tree
Hide file tree
Showing 10 changed files with 19 additions and 4 deletions.
1 change: 1 addition & 0 deletions examples/trials/cifar-adaptdl/config_adl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ trial:
image: registry.petuum.com/dev/nni:tmp-nni-cifar-1.8
imagePullSecrets:
- name: stagingsecret
adaptive: true
checkpoint:
storageClass: dfs
storageSize: 1Gi
Expand Down
1 change: 1 addition & 0 deletions examples/trials/lr-adaptdl/config_adl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ trial:
command: python3 /root/apps/lr-adaptdl/linear_regression.py
codeDir: .
gpuNum: 1
adaptive: true
image: registry.petuum.com/dev/nni:tmp-test
imagePullSecrets:
- name: stagingsecret
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/config/adl/adaptdljob-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
}
},
"spec": {
"preemptible": false,
"template": {
"spec": {
"containers": [
Expand Down
2 changes: 2 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ export namespace ValidationSchemas {
imagePullSecrets: joi.array({
name: joi.string().min(1).required()
}),
// ############## adl ###############
adaptive: joi.boolean(),
checkpoint: joi.object({
storageClass: joi.string().min(1).required(),
storageSize: joi.string().min(1).required()
Expand Down
7 changes: 6 additions & 1 deletion src/nni_manager/training_service/kubernetes/adl/adlConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,15 @@ export class AdlTrialConfig {

public readonly memorySize?: string;

public readonly adaptive?: boolean; // when set, copied to the AdaptDL job's spec.preemptible flag

constructor(codeDir: string,
command: string, gpuNum: number,
image: string, imagePullSecrets?: ImagePullSecretConfig[],
nfs?: NFSConfig, checkpoint?: CheckpointConfig,
cpuNum?: number, memorySize?: string) {
cpuNum?: number, memorySize?: string,
adaptive?: boolean
) {
this.codeDir = codeDir;
this.command = command;
this.gpuNum = gpuNum;
Expand All @@ -81,6 +85,7 @@ export class AdlTrialConfig {
this.checkpoint = checkpoint;
this.cpuNum = cpuNum;
this.memorySize = memorySize;
this.adaptive = adaptive;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ export class AdlJobInfoCollector extends KubernetesJobInfoCollector {
case 'Running':
case 'Stopping':
kubernetesTrialJob.status = 'RUNNING';
kubernetesTrialJob.message = undefined; //TODO(Petuum)
kubernetesTrialJob.message = `Use 'nnictl log trial --trial_id ${kubernetesTrialJob.id}' to check the log stream.`;
if (kubernetesTrialJob.startTime === undefined) {
kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.metadata.creationTimestamp);
}
Expand All @@ -74,7 +74,7 @@ export class AdlJobInfoCollector extends KubernetesJobInfoCollector {
kubernetesTrialJob.message = kubernetesJobInfo.status.message;
if (kubernetesPodsInfo.items.length > 0) {
kubernetesTrialJob.message += " ; ";
kubernetesTrialJob.message += `Use 'nnictl logs --trial_id ${kubernetesTrialJob.id}' to get pod failure log.`;
kubernetesTrialJob.message += `Use 'nnictl log trial --trial_id ${kubernetesTrialJob.id}' for the path of the collected logs.`;
}
// If completionTimestamp is undefined, Date.parse returns NaN, so endTime is NaN here
kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTimestamp);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ class AdlTrainingService extends KubernetesTrainingService implements Kubernetes
job.metadata.labels.app = this.NNI_KUBERNETES_TRIAL_LABEL
job.metadata.labels.expId = this.experimentId
job.metadata.labels.trialId = trialJobId
if (this.adlTrialConfig.adaptive !== undefined){
job.spec.preemptible = this.adlTrialConfig.adaptive
}
job.spec.template.spec.containers[0]
.image = this.adlTrialConfig.image;
job.spec.template.spec.volumes[0]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
kubernetesJobName: string, url: string) {
this.id = id;
this.status = status;
this.message = 'Creating the trial job.';
this.message = 'Pending for creating the trial job.';
this.submitTime = submitTime;
this.workingDirectory = workingDirectory;
this.form = form;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ describe('Unit Test for AdlTrainingService', () => {
"name": "stagingsecrets"
}
],
"adaptive": true,
"checkpoint": {
"storageClass": "aws-efs",
"storageSize": "1Gi"
Expand Down
1 change: 1 addition & 0 deletions tools/nni_cmd/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ def validate(self, data):
'path': setType('path', str),
'containerMountPath': setType('containerMountPath', str)
},
Optional('adaptive'): setType('adaptive', bool),
Optional('checkpoint'): {
'storageClass': setType('storageClass', str),
'storageSize': setType('storageSize', str)
Expand Down

0 comments on commit 449bff6

Please sign in to comment.