Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge master #264

Merged
merged 5 commits into from
Jul 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions deployment/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ RUN python3 -m pip --no-cache-dir install \
numpy==1.14.3 scipy==1.1.0

#
# Tensorflow 1.10.0
# Tensorflow 1.15
#
RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.10.0
RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.15

#
# Keras 2.1.6
Expand Down
14 changes: 8 additions & 6 deletions deployment/docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ Dockerfile
This is the Dockerfile of the NNI project. It includes several popular deep learning frameworks and NNI. It is tested on `Ubuntu 16.04 LTS`:

```
CUDA 9.0, CuDNN 7.0
numpy 1.14.3,scipy 1.1.0
TensorFlow-gpu 1.10.0
Keras 2.1.6
PyTorch 0.4.1
CUDA 9.0
CuDNN 7.0
numpy 1.14.3
scipy 1.1.0
tensorflow-gpu 1.15.0
keras 2.1.6
torch 1.4.0
scikit-learn 0.20.0
pandas 0.23.4
lightgbm 2.2.2
NNI v0.7
nni
```
You can take this Dockerfile as a reference for your own customized Dockerfile.

Expand Down
4 changes: 2 additions & 2 deletions docs/en_US/TrainingService/AMLMode.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Step 6. Create an AML cluster as the computeTarget.

Step 7. Open a command line and install AML package environment.
```
python3 -m pip install azureml --user
python3 -m pip install azureml-sdk --user
python3 -m pip install azureml
python3 -m pip install azureml-sdk
```

## Run an experiment
Expand Down
7 changes: 4 additions & 3 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,16 @@ function getIPV4Address(): string {
return cachedipv4Address;
}

if (os.networkInterfaces().eth0) {
for (const item of os.networkInterfaces().eth0) {
const networkInterfaces = os.networkInterfaces();
if (networkInterfaces.eth0) {
for (const item of networkInterfaces.eth0) {
if (item.family === 'IPv4') {
cachedipv4Address = item.address;
return cachedipv4Address;
}
}
} else {
throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
}

throw Error('getIPV4Address() failed because no valid IPv4 address found.')
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5",
"@types/mocha": "^5.2.5",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
Expand Down
5 changes: 5 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token: joi.string().min(1),
host: joi.string().min(1).required(),
reuse: joi.boolean(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(1),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean(),
}),
kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
operator: joi.string().min(1).required(),
Expand Down
24 changes: 24 additions & 0 deletions src/nni_manager/training_service/common/gpuData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@

'use strict';

export enum ScheduleResultType {
    // Schedule succeeded: the required number of GPUs was allocated
    SUCCEED,

    // Temporarily no available GPU right now; scheduling may succeed later
    TMP_NO_AVAILABLE_GPU,

    // Cannot satisfy the requirement even if all GPUs were free — request exceeds total capacity
    REQUIRE_EXCEED_TOTAL
}

/**
* GPU Information class
* Representing the dynamic and static information retrieved from Nvidia-smi
Expand Down Expand Up @@ -52,6 +63,19 @@ export class GPUSummary {
}
}


/**
 * Parse a comma-separated list of GPU indices (e.g. "0,1,2") into a Set.
 *
 * @param gpuIndices comma-separated GPU indices; undefined means "no restriction"
 * @returns the parsed indices as a Set<number>, or undefined when gpuIndices is undefined
 * @throws Error when the string is empty or contains a non-numeric entry
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }
    const indices: number[] = gpuIndices.split(',')
        .map((x: string) => parseInt(x, 10));
    // ''.split(',') yields [''] -> [NaN], so a plain length check cannot catch
    // an empty string; validate every parsed entry explicitly instead.
    if (indices.length === 0 || indices.some((x: number) => Number.isNaN(x))) {
        throw new Error('gpuIndices can not be empty if specified.');
    }
    return new Set(indices);
}

export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
Expand Down
4 changes: 4 additions & 0 deletions src/nni_manager/training_service/common/trialConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum: number;

// this flag uses for UT now.
// in future, all environments should be reusable, and this can be configurable by user.
public reuseEnvironment: boolean | undefined = true;

/**
* Constructor
* @param command Trail command
Expand Down
19 changes: 16 additions & 3 deletions src/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

export class PAIClusterConfig {
public readonly userName: string;
Expand All @@ -12,6 +12,13 @@ export class PAIClusterConfig {
public readonly token?: string;
public readonly reuse?: boolean;

public cpuNum?: number;
public memoryMB?: number;
public gpuNum?: number;

public useActiveGpu?: boolean;
public maxTrialNumPerGpu?: number;

/**
* Constructor
* @param userName User name of PAI Cluster
Expand All @@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
*/
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
cpuNum?: number, memoryMB?: number, gpuNum?: number) {
this.userName = userName;
this.passWord = passWord;
this.host = host;
this.token = token;
this.reuse = reuse;
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.gpuNum = gpuNum;
}
}

Expand All @@ -45,9 +56,10 @@ export class PAITrialJobDetail implements TrialJobDetail {
public form: TrialJobApplicationForm;
public logPath: string;
public isEarlyStopped?: boolean;
public paiJobDetailUrl?: string;

constructor(id: string, status: TrialJobStatus, paiJobName: string,
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) {
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string, paiJobDetailUrl?: string) {
this.id = id;
this.status = status;
this.paiJobName = paiJobName;
Expand All @@ -56,5 +68,6 @@ export class PAITrialJobDetail implements TrialJobDetail {
this.form = form;
this.tags = [];
this.logPath = logPath;
this.paiJobDetailUrl = paiJobDetailUrl;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ export class PAIJobInfoCollector {
if (response.body.jobStatus.appTrackingUrl) {
paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
} else {
paiTrialJob.url = paiTrialJob.logPath;
paiTrialJob.url = paiTrialJob.paiJobDetailUrl;
}
}
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,16 @@ class PAIK8STrainingService extends PAITrainingService {
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId);
const paiJobDetailUrl: string = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${paiJobName}`;
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
trialJobId,
'WAITING',
paiJobName,
Date.now(),
trialWorkingFolder,
form,
logPath);
logPath,
paiJobDetailUrl);

this.trialJobsMap.set(trialJobId, trialJobDetail);
this.jobQueue.push(trialJobId);
Expand Down
26 changes: 12 additions & 14 deletions src/nni_manager/training_service/remote_machine/gpuScheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData';
import {
parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
} from './remoteMachineData';
import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

Expand Down Expand Up @@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) {
if (requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
Expand All @@ -48,7 +46,7 @@ export class GPUScheduler {

// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
Expand All @@ -75,8 +73,8 @@ export class GPUScheduler {
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

return {
resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo : undefined
resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo: undefined
};
}

Expand Down Expand Up @@ -159,7 +157,7 @@ export class GPUScheduler {
const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
(num !== undefined && num < maxTrialNumPerGpu)) {
(num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
Expand Down Expand Up @@ -200,7 +198,7 @@ export class GPUScheduler {
}

private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
Expand All @@ -222,10 +220,10 @@ export class GPUScheduler {
scheduleInfo: {
rmMeta: rmMeta,
cudaVisibleDevice: allocatedGPUs
.map((gpuInfo: GPUInfo) => {
return gpuInfo.index;
})
.join(',')
.map((gpuInfo: GPUInfo) => {
return gpuInfo.index;
})
.join(',')
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
import { ShellExecutor } from './shellExecutor';

/**
Expand All @@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public readonly useActiveGpu?: boolean = false;
}

/**
 * Parse a comma-separated list of GPU indices (e.g. "0,1,2") into a Set.
 *
 * @param gpuIndices comma-separated GPU indices; undefined means "no restriction"
 * @returns the parsed indices as a Set<number>, or undefined when gpuIndices is undefined
 * @throws Error when the string is empty or any entry is not a number
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices !== undefined) {
        const indices: number[] = gpuIndices.split(',')
            .map((x: string) => parseInt(x, 10));
        // Guard against ''.split(',') === [''] (parses to [NaN]) and any
        // malformed token: the original length check alone cannot detect these.
        if (indices.length === 0 || indices.some((x: number) => Number.isNaN(x))) {
            throw new Error('gpuIndices can not be empty if specified.');
        }
        return new Set(indices);
    }
}

/**
* The execution result for command executed on remote machine
*/
Expand Down Expand Up @@ -168,14 +156,3 @@ export class ExecutorManager {
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };

export enum ScheduleResultType {
    // Schedule succeeded: the required number of GPUs was allocated
    SUCCEED,

    // Temporarily no available GPU right now; scheduling may succeed later
    TMP_NO_AVAILABLE_GPU,

    // Cannot satisfy the requirement even if all GPUs were free — request exceeds total capacity
    REQUIRE_EXCEED_TOTAL
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import * as assert from 'assert';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
Expand All @@ -22,18 +23,16 @@ import {
getVersion, uniqueString
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { GPUSummary } from '../common/gpuData';
import { GPUSummary, ScheduleResultType } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util';
import { GPUScheduler } from './gpuScheduler';
import {
RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
ScheduleResultType, ExecutorManager
ExecutorManager, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
} from './remoteMachineData';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

/**
* Training Service implementation for Remote Machine (Linux)
Expand Down
Loading