Skip to content

Commit

Permalink
Merge pull request #264 from microsoft/master
Browse files Browse the repository at this point in the history
merge master
  • Loading branch information
SparkSnail authored Jul 31, 2020
2 parents aa64fe6 + 143c661 commit c6a5f8c
Show file tree
Hide file tree
Showing 36 changed files with 1,771 additions and 280 deletions.
4 changes: 2 additions & 2 deletions deployment/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ RUN python3 -m pip --no-cache-dir install \
numpy==1.14.3 scipy==1.1.0

#
# Tensorflow 1.10.0
# Tensorflow 1.15
#
RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.10.0
RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.15

#
# Keras 2.1.6
Expand Down
14 changes: 8 additions & 6 deletions deployment/docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ Dockerfile
This is the Dockerfile of the NNI project. It includes several popular deep learning frameworks and NNI. It is tested on `Ubuntu 16.04 LTS`:

```
CUDA 9.0, CuDNN 7.0
numpy 1.14.3,scipy 1.1.0
TensorFlow-gpu 1.10.0
Keras 2.1.6
PyTorch 0.4.1
CUDA 9.0
CuDNN 7.0
numpy 1.14.3
scipy 1.1.0
tensorflow-gpu 1.15.0
keras 2.1.6
torch 1.4.0
scikit-learn 0.20.0
pandas 0.23.4
lightgbm 2.2.2
NNI v0.7
nni
```
You can take this Dockerfile as a reference for your own customized Dockerfile.

Expand Down
4 changes: 2 additions & 2 deletions docs/en_US/TrainingService/AMLMode.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Step 6. Create an AML cluster as the computeTarget.

Step 7. Open a command line and install AML package environment.
```
python3 -m pip install azureml --user
python3 -m pip install azureml-sdk --user
python3 -m pip install azureml
python3 -m pip install azureml-sdk
```

## Run an experiment
Expand Down
7 changes: 4 additions & 3 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,16 @@ function getIPV4Address(): string {
return cachedipv4Address;
}

if (os.networkInterfaces().eth0) {
for (const item of os.networkInterfaces().eth0) {
const networkInterfaces = os.networkInterfaces();
if (networkInterfaces.eth0) {
for (const item of networkInterfaces.eth0) {
if (item.family === 'IPv4') {
cachedipv4Address = item.address;
return cachedipv4Address;
}
}
} else {
throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
}

throw Error('getIPV4Address() failed because no valid IPv4 address found.')
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5",
"@types/mocha": "^5.2.5",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
Expand Down
5 changes: 5 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token: joi.string().min(1),
host: joi.string().min(1).required(),
reuse: joi.boolean(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(1),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean(),
}),
kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
operator: joi.string().min(1).required(),
Expand Down
24 changes: 24 additions & 0 deletions src/nni_manager/training_service/common/gpuData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@

'use strict';

// Outcome of one GPU scheduling attempt.
export enum ScheduleResultType {
    // Schedule succeeded
    SUCCEED,

    // Temporarily not enough available GPUs right now; caller may retry later
    TMP_NO_AVAILABLE_GPU,

    // Cannot match the requirement even if all GPUs were free
    REQUIRE_EXCEED_TOTAL
}

/**
* GPU Information class
* Representing the dynamic and static information retrieved from Nvidia-smi
Expand Down Expand Up @@ -52,6 +63,19 @@ export class GPUSummary {
}
}


/**
 * Parse a comma separated list of GPU indices (e.g. "0,1,2") into a set.
 * @param gpuIndices comma separated GPU indices; undefined means "no restriction".
 * @returns the parsed index set, or undefined when gpuIndices is undefined.
 * @throws Error when the string is empty or contains a non-numeric entry.
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }
    const indices: number[] = gpuIndices.split(',')
        .map((x: string) => parseInt(x, 10));
    // Note: split(',') always yields at least one element (''.split(',') === ['']),
    // so a plain length check cannot detect an empty/invalid list. parseInt turns
    // malformed entries (including '') into NaN; reject those explicitly instead of
    // silently returning a Set containing NaN.
    if (indices.some((x: number) => Number.isNaN(x))) {
        throw new Error('gpuIndices can not be empty if specified.');
    }
    return new Set(indices);
}

export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
Expand Down
4 changes: 4 additions & 0 deletions src/nni_manager/training_service/common/trialConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum: number;

// this flag uses for UT now.
// in future, all environments should be reusable, and this can be configurable by user.
public reuseEnvironment: boolean | undefined = true;

/**
* Constructor
* @param command Trail command
Expand Down
19 changes: 16 additions & 3 deletions src/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

export class PAIClusterConfig {
public readonly userName: string;
Expand All @@ -12,6 +12,13 @@ export class PAIClusterConfig {
public readonly token?: string;
public readonly reuse?: boolean;

public cpuNum?: number;
public memoryMB?: number;
public gpuNum?: number;

public useActiveGpu?: boolean;
public maxTrialNumPerGpu?: number;

/**
* Constructor
* @param userName User name of PAI Cluster
Expand All @@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
*/
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
cpuNum?: number, memoryMB?: number, gpuNum?: number) {
this.userName = userName;
this.passWord = passWord;
this.host = host;
this.token = token;
this.reuse = reuse;
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.gpuNum = gpuNum;
}
}

Expand All @@ -45,9 +56,10 @@ export class PAITrialJobDetail implements TrialJobDetail {
public form: TrialJobApplicationForm;
public logPath: string;
public isEarlyStopped?: boolean;
public paiJobDetailUrl?: string;

constructor(id: string, status: TrialJobStatus, paiJobName: string,
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) {
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string, paiJobDetailUrl?: string) {
this.id = id;
this.status = status;
this.paiJobName = paiJobName;
Expand All @@ -56,5 +68,6 @@ export class PAITrialJobDetail implements TrialJobDetail {
this.form = form;
this.tags = [];
this.logPath = logPath;
this.paiJobDetailUrl = paiJobDetailUrl;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ export class PAIJobInfoCollector {
if (response.body.jobStatus.appTrackingUrl) {
paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
} else {
paiTrialJob.url = paiTrialJob.logPath;
paiTrialJob.url = paiTrialJob.paiJobDetailUrl;
}
}
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,16 @@ class PAIK8STrainingService extends PAITrainingService {
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId);
const paiJobDetailUrl: string = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${paiJobName}`;
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
trialJobId,
'WAITING',
paiJobName,
Date.now(),
trialWorkingFolder,
form,
logPath);
logPath,
paiJobDetailUrl);

this.trialJobsMap.set(trialJobId, trialJobDetail);
this.jobQueue.push(trialJobId);
Expand Down
26 changes: 12 additions & 14 deletions src/nni_manager/training_service/remote_machine/gpuScheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData';
import {
parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
} from './remoteMachineData';
import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

Expand Down Expand Up @@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) {
if (requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
Expand All @@ -48,7 +46,7 @@ export class GPUScheduler {

// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
Expand All @@ -75,8 +73,8 @@ export class GPUScheduler {
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

return {
resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo : undefined
resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo: undefined
};
}

Expand Down Expand Up @@ -159,7 +157,7 @@ export class GPUScheduler {
const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
(num !== undefined && num < maxTrialNumPerGpu)) {
(num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
Expand Down Expand Up @@ -200,7 +198,7 @@ export class GPUScheduler {
}

private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
Expand All @@ -222,10 +220,10 @@ export class GPUScheduler {
scheduleInfo: {
rmMeta: rmMeta,
cudaVisibleDevice: allocatedGPUs
.map((gpuInfo: GPUInfo) => {
return gpuInfo.index;
})
.join(',')
.map((gpuInfo: GPUInfo) => {
return gpuInfo.index;
})
.join(',')
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
import { ShellExecutor } from './shellExecutor';

/**
Expand All @@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public readonly useActiveGpu?: boolean = false;
}

export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
if (gpuIndices !== undefined) {
const indices: number[] = gpuIndices.split(',')
.map((x: string) => parseInt(x, 10));
if (indices.length > 0) {
return new Set(indices);
} else {
throw new Error('gpuIndices can not be empty if specified.');
}
}
}

/**
* The execution result for command executed on remote machine
*/
Expand Down Expand Up @@ -168,14 +156,3 @@ export class ExecutorManager {
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };

export enum ScheduleResultType {
// Schedule succeeded
SUCCEED,

// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU,

// Cannot match the requirement even if all GPUs were free
REQUIRE_EXCEED_TOTAL
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import * as assert from 'assert';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
Expand All @@ -22,18 +23,16 @@ import {
getVersion, uniqueString
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { GPUSummary } from '../common/gpuData';
import { GPUSummary, ScheduleResultType } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util';
import { GPUScheduler } from './gpuScheduler';
import {
RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
ScheduleResultType, ExecutorManager
ExecutorManager, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
} from './remoteMachineData';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

/**
* Training Service implementation for Remote Machine (Linux)
Expand Down
Loading

0 comments on commit c6a5f8c

Please sign in to comment.