Skip to content

Commit

Permalink
Merge pull request #264 from microsoft/master
Browse files Browse the repository at this point in the history
merge master
  • Loading branch information
SparkSnail authored Jul 31, 2020
2 parents aa64fe6 + 143c661 commit c6a5f8c
Show file tree
Hide file tree
Showing 36 changed files with 1,771 additions and 280 deletions.
4 changes: 2 additions & 2 deletions deployment/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ RUN python3 -m pip --no-cache-dir install \
numpy==1.14.3 scipy==1.1.0

#
# Tensorflow 1.10.0
# Tensorflow 1.15
#
RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.10.0
RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.15

#
# Keras 2.1.6
Expand Down
14 changes: 8 additions & 6 deletions deployment/docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ Dockerfile
This is the Dockerfile of the NNI project. It includes several popular deep learning frameworks and NNI. It is tested on `Ubuntu 16.04 LTS`:

```
CUDA 9.0, CuDNN 7.0
numpy 1.14.3,scipy 1.1.0
TensorFlow-gpu 1.10.0
Keras 2.1.6
PyTorch 0.4.1
CUDA 9.0
CuDNN 7.0
numpy 1.14.3
scipy 1.1.0
tensorflow-gpu 1.15.0
keras 2.1.6
torch 1.4.0
scikit-learn 0.20.0
pandas 0.23.4
lightgbm 2.2.2
NNI v0.7
nni
```
You can take this Dockerfile as a reference for your own customized Dockerfile.

Expand Down
4 changes: 2 additions & 2 deletions docs/en_US/TrainingService/AMLMode.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Step 6. Create an AML cluster as the computeTarget.

Step 7. Open a command line and install AML package environment.
```
python3 -m pip install azureml --user
python3 -m pip install azureml-sdk --user
python3 -m pip install azureml
python3 -m pip install azureml-sdk
```

## Run an experiment
Expand Down
7 changes: 4 additions & 3 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,16 @@ function getIPV4Address(): string {
return cachedipv4Address;
}

if (os.networkInterfaces().eth0) {
for (const item of os.networkInterfaces().eth0) {
const networkInterfaces = os.networkInterfaces();
if (networkInterfaces.eth0) {
for (const item of networkInterfaces.eth0) {
if (item.family === 'IPv4') {
cachedipv4Address = item.address;
return cachedipv4Address;
}
}
} else {
throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
}

throw Error('getIPV4Address() failed because no valid IPv4 address found.')
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5",
"@types/mocha": "^5.2.5",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
Expand Down
5 changes: 5 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token: joi.string().min(1),
host: joi.string().min(1).required(),
reuse: joi.boolean(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(1),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean(),
}),
kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
operator: joi.string().min(1).required(),
Expand Down
24 changes: 24 additions & 0 deletions src/nni_manager/training_service/common/gpuData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@

'use strict';

// Outcome of one GPU scheduling attempt.
export enum ScheduleResultType {
    // Schedule succeeded
    SUCCEED,

    // Temporarily not enough available GPUs right now; caller may retry later
    TMP_NO_AVAILABLE_GPU,

    // Cannot match the requirement even if all GPUs were free
    REQUIRE_EXCEED_TOTAL
}

/**
* GPU Information class
* Representing the dynamic and static information retrieved from Nvidia-smi
Expand Down Expand Up @@ -52,6 +63,19 @@ export class GPUSummary {
}
}


/**
 * Parse a comma separated list of GPU indices (e.g. "0,1,2") into a set.
 * @param gpuIndices comma separated GPU indices; undefined means "no restriction".
 * @returns the parsed index set, or undefined when gpuIndices is undefined.
 * @throws Error when the string is empty or contains a non-numeric entry.
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }
    const indices: number[] = gpuIndices.split(',')
        .map((x: string) => parseInt(x, 10));
    // Note: split(',') always yields at least one element (''.split(',') === ['']),
    // so a plain length check cannot detect an empty/invalid list. parseInt turns
    // malformed entries (including '') into NaN; reject those explicitly instead of
    // silently returning a Set containing NaN.
    if (indices.some((x: number) => Number.isNaN(x))) {
        throw new Error('gpuIndices can not be empty if specified.');
    }
    return new Set(indices);
}

export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
Expand Down
4 changes: 4 additions & 0 deletions src/nni_manager/training_service/common/trialConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum: number;

// this flag uses for UT now.
// in future, all environments should be reusable, and this can be configurable by user.
public reuseEnvironment: boolean | undefined = true;

/**
* Constructor
* @param command Trail command
Expand Down
19 changes: 16 additions & 3 deletions src/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

export class PAIClusterConfig {
public readonly userName: string;
Expand All @@ -12,6 +12,13 @@ export class PAIClusterConfig {
public readonly token?: string;
public readonly reuse?: boolean;

public cpuNum?: number;
public memoryMB?: number;
public gpuNum?: number;

public useActiveGpu?: boolean;
public maxTrialNumPerGpu?: number;

/**
* Constructor
* @param userName User name of PAI Cluster
Expand All @@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
*/
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
cpuNum?: number, memoryMB?: number, gpuNum?: number) {
this.userName = userName;
this.passWord = passWord;
this.host = host;
this.token = token;
this.reuse = reuse;
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.gpuNum = gpuNum;
}
}

Expand All @@ -45,9 +56,10 @@ export class PAITrialJobDetail implements TrialJobDetail {
public form: TrialJobApplicationForm;
public logPath: string;
public isEarlyStopped?: boolean;
public paiJobDetailUrl?: string;

constructor(id: string, status: TrialJobStatus, paiJobName: string,
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) {
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string, paiJobDetailUrl?: string) {
this.id = id;
this.status = status;
this.paiJobName = paiJobName;
Expand All @@ -56,5 +68,6 @@ export class PAITrialJobDetail implements TrialJobDetail {
this.form = form;
this.tags = [];
this.logPath = logPath;
this.paiJobDetailUrl = paiJobDetailUrl;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ export class PAIJobInfoCollector {
if (response.body.jobStatus.appTrackingUrl) {
paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
} else {
paiTrialJob.url = paiTrialJob.logPath;
paiTrialJob.url = paiTrialJob.paiJobDetailUrl;
}
}
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,16 @@ class PAIK8STrainingService extends PAITrainingService {
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId);
const paiJobDetailUrl: string = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${paiJobName}`;
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
trialJobId,
'WAITING',
paiJobName,
Date.now(),
trialWorkingFolder,
form,
logPath);
logPath,
paiJobDetailUrl);

this.trialJobsMap.set(trialJobId, trialJobDetail);
this.jobQueue.push(trialJobId);
Expand Down
26 changes: 12 additions & 14 deletions src/nni_manager/training_service/remote_machine/gpuScheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData';
import {
parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
} from './remoteMachineData';
import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

Expand Down Expand Up @@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) {
if (requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
Expand All @@ -48,7 +46,7 @@ export class GPUScheduler {

// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
Expand All @@ -75,8 +73,8 @@ export class GPUScheduler {
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

return {
resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo : undefined
resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo: undefined
};
}

Expand Down Expand Up @@ -159,7 +157,7 @@ export class GPUScheduler {
const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
(num !== undefined && num < maxTrialNumPerGpu)) {
(num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
Expand Down Expand Up @@ -200,7 +198,7 @@ export class GPUScheduler {
}

private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
Expand All @@ -222,10 +220,10 @@ export class GPUScheduler {
scheduleInfo: {
rmMeta: rmMeta,
cudaVisibleDevice: allocatedGPUs
.map((gpuInfo: GPUInfo) => {
return gpuInfo.index;
})
.join(',')
.map((gpuInfo: GPUInfo) => {
return gpuInfo.index;
})
.join(',')
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
import { ShellExecutor } from './shellExecutor';

/**
Expand All @@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public readonly useActiveGpu?: boolean = false;
}

export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
if (gpuIndices !== undefined) {
const indices: number[] = gpuIndices.split(',')
.map((x: string) => parseInt(x, 10));
if (indices.length > 0) {
return new Set(indices);
} else {
throw new Error('gpuIndices can not be empty if specified.');
}
}
}

/**
* The execution result for command executed on remote machine
*/
Expand Down Expand Up @@ -168,14 +156,3 @@ export class ExecutorManager {
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };

export enum ScheduleResultType {
// Schedule succeeded
SUCCEED,

// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU,

// Cannot match the requirement even if all GPUs were free
REQUIRE_EXCEED_TOTAL
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import * as assert from 'assert';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
Expand All @@ -22,18 +23,16 @@ import {
getVersion, uniqueString
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { GPUSummary } from '../common/gpuData';
import { GPUSummary, ScheduleResultType } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util';
import { GPUScheduler } from './gpuScheduler';
import {
RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
ScheduleResultType, ExecutorManager
ExecutorManager, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
} from './remoteMachineData';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

/**
* Training Service implementation for Remote Machine (Linux)
Expand Down
Loading

0 comments on commit c6a5f8c

Please sign in to comment.