This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Reusable environment: support GPU scheduler, add test cases, and refactor. #2627

Merged: 25 commits, merged on Jul 30, 2020.
7 changes: 4 additions & 3 deletions src/nni_manager/common/utils.ts
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
return cachedipv4Address;
}

- if (os.networkInterfaces().eth0) {
- for (const item of os.networkInterfaces().eth0) {
+ const networkInterfaces = os.networkInterfaces();
+ if (networkInterfaces.eth0) {
+ for (const item of networkInterfaces.eth0) {
if (item.family === 'IPv4') {
cachedipv4Address = item.address;
return cachedipv4Address;
}
}
} else {
- throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
+ throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
}

throw Error('getIPV4Address() failed because no valid IPv4 address found.')
1 change: 1 addition & 0 deletions src/nni_manager/package.json
@@ -39,6 +39,7 @@
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5",
Contributor: Doesn't this already have the js-yaml package? Refer to line 72.

Member (author): It is used to provide a TypeScript type declaration, so that we can import the library with import * as yaml from 'js-yaml'; instead of const yaml = require('js-yaml');.

"@types/mocha": "^5.2.5",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
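Side note on the thread above: a minimal sketch of the typed import style that @types/js-yaml enables (the YAML snippet and usage here are illustrative, not code from this PR):

// Typed import made possible by the @types/js-yaml declaration package;
// without it, TypeScript code falls back to: const yaml = require('js-yaml');
import * as yaml from 'js-yaml';

// js-yaml 3.x exposes safeLoad() for parsing YAML text into a plain object.
const parsed = yaml.safeLoad('trialConcurrency: 2');
console.log(parsed);   // { trialConcurrency: 2 }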
5 changes: 5 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token: joi.string().min(1),
host: joi.string().min(1).required(),
reuse: joi.boolean(),
+ cpuNum: joi.number().min(1),
+ memoryMB: joi.number().min(100),
+ gpuNum: joi.number().min(1),
+ maxTrialNumPerGpu: joi.number(),
+ useActiveGpu: joi.boolean(),
}),
kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
operator: joi.string().min(1).required(),
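For context, a small sketch of how the new remote_config fields could be validated with a joi schema shaped like the entries above (the standalone schema and the sample values are illustrative, not code from this PR):

import * as joi from 'joi';

// Illustrative schema mirroring the remote_config fields added in the diff.
const remoteConfigSchema = joi.object({
    host: joi.string().min(1).required(),
    reuse: joi.boolean(),
    cpuNum: joi.number().min(1),
    memoryMB: joi.number().min(100),
    gpuNum: joi.number().min(1),
    maxTrialNumPerGpu: joi.number(),
    useActiveGpu: joi.boolean()
});

// A config with one GPU that allows two trials to share each GPU.
const { error } = remoteConfigSchema.validate({
    host: '10.0.0.1',
    reuse: true,
    gpuNum: 1,
    maxTrialNumPerGpu: 2,
    useActiveGpu: false
});
if (error) {
    throw error;
}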
24 changes: 24 additions & 0 deletions src/nni_manager/training_service/common/gpuData.ts
@@ -3,6 +3,17 @@

'use strict';

+ export enum ScheduleResultType {
+ // Schedule succeeded
+ SUCCEED,
+
+ // Temporarily, no enough available GPU right now
+ TMP_NO_AVAILABLE_GPU,
+
+ // Cannot match requirement even if all GPU are a
Contributor: a => available?

Member (author): Copied from previous code; it should read something like: "No environment can match the hard requirement, e.g. the GPU number is smaller than what the trial asked for."

+ REQUIRE_EXCEED_TOTAL
+ }

/**
* GPU Infromation class
* Representing the dynamic and static information retrieved from Nvidia-smi
@@ -52,6 +63,19 @@ export class GPUSummary {
}
}


+ export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
+ if (gpuIndices !== undefined) {
+ const indices: number[] = gpuIndices.split(',')
+ .map((x: string) => parseInt(x, 10));
+ if (indices.length > 0) {
+ return new Set(indices);
+ } else {
+ throw new Error('gpuIndices can not be empty if specified.');
+ }
+ }
+ }

export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
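A brief usage note on the relocated parseGpuIndices helper (the import path and values below are illustrative):

// Relative path assumes the caller sits next to training_service/; adjust as needed.
import { parseGpuIndices } from './training_service/common/gpuData';

const indices = parseGpuIndices('0,1,3');        // Set { 0, 1, 3 }
const unrestricted = parseGpuIndices(undefined); // undefined: no GPU restriction
console.log(indices, unrestricted);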
4 changes: 4 additions & 0 deletions src/nni_manager/training_service/common/trialConfig.ts
@@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum: number;

+ // This flag is only used for unit tests now.
+ // In the future, all environments should be reusable, and this can be made configurable by the user.
+ public reuseEnvironment: boolean | undefined = true;

/**
* Constructor
* @param command Trail command
15 changes: 13 additions & 2 deletions src/nni_manager/training_service/pai/paiConfig.ts
@@ -3,7 +3,7 @@

'use strict';

- import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
+ import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

export class PAIClusterConfig {
public readonly userName: string;
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
public readonly token?: string;
public readonly reuse?: boolean;

+ public cpuNum?: number;
+ public memoryMB?: number;
+ public gpuNum?: number;
+
+ public useActiveGpu?: boolean;
+ public maxTrialNumPerGpu?: number;

/**
* Constructor
* @param userName User name of PAI Cluster
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
*/
- constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
+ constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
+ cpuNum?: number, memoryMB?: number, gpuNum?: number) {
this.userName = userName;
this.passWord = passWord;
this.host = host;
this.token = token;
this.reuse = reuse;
+ this.cpuNum = cpuNum;
+ this.memoryMB = memoryMB;
+ this.gpuNum = gpuNum;
Contributor: In other platforms like PAI, remote, and Kubeflow, we added the gpuNum and cpuNum settings in trialConfig, not in the cluster config. I think we had better unify them.

Member (author): There are three things about gpuNum:

1. The settings here tell OpenPAI what kind of environment is needed. This is unique to OpenPAI; it does not apply to remote or AML, and I'm not sure about Kubeflow.
2. Environment capacity. This is general to all environments and does not need to be configured like this, since the GPU collector (nvidia-smi) detects environment capacity at runtime.
3. The requirement of a single trial. This lives in trialConfig. Together with the environment capacity above, we can schedule multiple trials on one environment. By the way, cpuNum and memoryMB in the trial config are not used today; they may be useful in the future.
}
}

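To make the distinction in the review thread above concrete, here is a small hypothetical sketch (types and names are illustrative, not NNI's actual classes) of how an environment's detected GPU capacity and a trial's gpuNum requirement can be combined to decide whether another trial fits:

// Hypothetical shapes for illustration only.
interface EnvironmentCapacity {
    gpuCount: number;          // detected at runtime, e.g. by the nvidia-smi based GPU collector
    assignedTrialGpus: number; // GPUs already taken by trials running in this environment
}

interface TrialRequirement {
    gpuNum: number;            // per-trial requirement from trialConfig
}

// True if one more trial can be packed into this environment.
function canScheduleTrial(env: EnvironmentCapacity, trial: TrialRequirement): boolean {
    return env.gpuCount - env.assignedTrialGpus >= trial.gpuNum;
}

// A 4-GPU environment already running one 2-GPU trial can still take another 2-GPU trial.
console.log(canScheduleTrial({ gpuCount: 4, assignedTrialGpus: 2 }, { gpuNum: 2 })); // true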
26 changes: 12 additions & 14 deletions src/nni_manager/training_service/remote_machine/gpuScheduler.ts
@@ -6,10 +6,8 @@
import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
- import { GPUInfo } from '../common/gpuData';
- import {
- parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
- } from './remoteMachineData';
+ import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
+ import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

@@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
- if(requiredGPUNum === undefined) {
+ if (requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
@@ -48,7 +46,7 @@

// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
- rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
+ rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
@@ -75,8 +73,8 @@
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

return {
- resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
- scheduleInfo : undefined
+ resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
+ scheduleInfo: undefined
};
}

@@ -159,7 +157,7 @@
const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
- (num !== undefined && num < maxTrialNumPerGpu)) {
+ (num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
@@ -200,7 +198,7 @@
}

private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
- gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
+ gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
@@ -222,10 +220,10 @@
scheduleInfo: {
rmMeta: rmMeta,
cudaVisibleDevice: allocatedGPUs
- .map((gpuInfo: GPUInfo) => {
- return gpuInfo.index;
- })
- .join(',')
+ .map((gpuInfo: GPUInfo) => {
+ return gpuInfo.index;
+ })
+ .join(',')
}
};
}
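The availability check in this scheduler (the condition involving useActiveGpu and maxTrialNumPerGpu) is fairly dense, so here is a hedged restatement of the same logic as a standalone helper; the parameter names mirror the diff, but this sketch is not the code NNI ships:

// trialsOnGpu: number of NNI trials already placed on this GPU, or undefined if none.
// activeProcessNum: processes reported on the GPU (possibly from other users or jobs).
function isGpuAvailable(
    trialsOnGpu: number | undefined,
    activeProcessNum: number,
    useActiveGpu: boolean,
    maxTrialNumPerGpu: number = 1
): boolean {
    if (trialsOnGpu === undefined) {
        // No NNI trial on this GPU yet: usable if the GPU is idle, or if the user
        // opted in (useActiveGpu) to sharing GPUs that already have active processes.
        return useActiveGpu || activeProcessNum === 0;
    }
    // Some NNI trials already run here: allow more only up to maxTrialNumPerGpu.
    return trialsOnGpu < maxTrialNumPerGpu;
}

console.log(isGpuAvailable(undefined, 0, false));  // true: idle GPU
console.log(isGpuAvailable(undefined, 2, false));  // false: busy GPU, sharing not enabled
console.log(isGpuAvailable(1, 0, false, 2));       // true: 1 < maxTrialNumPerGpu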
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
@@ -4,7 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
import { ShellExecutor } from './shellExecutor';

/**
Expand All @@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public readonly useActiveGpu?: boolean = false;
}

- export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
- if (gpuIndices !== undefined) {
- const indices: number[] = gpuIndices.split(',')
- .map((x: string) => parseInt(x, 10));
- if (indices.length > 0) {
- return new Set(indices);
- } else {
- throw new Error('gpuIndices can not be empty if specified.');
- }
- }
- }

/**
* The execution result for command executed on remote machine
*/
@@ -168,14 +156,3 @@ export class ExecutorManager {
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };

- export enum ScheduleResultType {
- // Schedule succeeded
- SUCCEED,
-
- // Temporarily, no enough available GPU right now
- TMP_NO_AVAILABLE_GPU,
-
- // Cannot match requirement even if all GPU are a
- REQUIRE_EXCEED_TOTAL
- }
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -7,6 +7,7 @@ import * as assert from 'assert';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
+ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
@@ -22,18 +23,16 @@ import {
getVersion, uniqueString
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
- import { GPUSummary } from '../common/gpuData';
+ import { GPUSummary, ScheduleResultType } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util';
import { GPUScheduler } from './gpuScheduler';
import {
- RemoteMachineMeta,
- RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
- ScheduleResultType, ExecutorManager
+ ExecutorManager, RemoteMachineMeta,
+ RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
} from './remoteMachineData';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
- import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

/**
* Training Service implementation for Remote Machine (Linux)
src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
@@ -3,7 +3,6 @@

'use strict';

- import { EventEmitter } from 'events';
import { delay } from "../../../common/utils";
import { AMLEnvironmentInformation } from '../aml/amlConfig';
import { CommandChannel, RunnerConnection } from "../commandChannel";
@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
export class AMLCommandChannel extends CommandChannel {
private stopping: boolean = false;
private sendQueues: [EnvironmentInformation, string][] = [];
- private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`;

- public constructor(commandEmitter: EventEmitter) {
- super(commandEmitter);
- }

public get channelName(): Channel {
return "aml";
}
@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
const messages = command['trial_runner'];
if (messages) {
if (messages instanceof Object && currentMessageIndex < messages.length - 1) {
- for (let index = currentMessageIndex + 1; index < messages.length; index ++) {
+ for (let index = currentMessageIndex + 1; index < messages.length; index++) {
this.handleCommand(runnerConnection.environment, messages[index]);
}
currentMessageIndex = messages.length - 1;
- } else if (currentMessageIndex === -1){
+ } else if (currentMessageIndex === -1) {
this.handleCommand(runnerConnection.environment, messages);
currentMessageIndex += 1;
}