This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Reusable environment: support GPU scheduler, add test cases, and refactor. #2627

Merged: 25 commits, merged on Jul 30, 2020.
7 changes: 4 additions & 3 deletions src/nni_manager/common/utils.ts
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
return cachedipv4Address;
}

- if (os.networkInterfaces().eth0) {
- for (const item of os.networkInterfaces().eth0) {
+ const networkInterfaces = os.networkInterfaces();
+ if (networkInterfaces.eth0) {
+ for (const item of networkInterfaces.eth0) {
if (item.family === 'IPv4') {
cachedipv4Address = item.address;
return cachedipv4Address;
}
}
} else {
- throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
+ throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
}

throw Error('getIPV4Address() failed because no valid IPv4 address found.')
1 change: 1 addition & 0 deletions src/nni_manager/package.json
@@ -39,6 +39,7 @@
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5",
Contributor: Doesn't this already have the js-yaml package? Refer to line 72.

Member (author): It is used to provide a TypeScript type declaration, so that we can import the library with import * as yaml from 'js-yaml'; instead of const yaml = require('js-yaml');.

"@types/mocha": "^5.2.5",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
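Side note on the thread above: a minimal sketch of the typed import style that @types/js-yaml enables (the YAML snippet and usage here are illustrative, not code from this PR):

// Typed import made possible by the @types/js-yaml declaration package;
// without it, TypeScript code falls back to: const yaml = require('js-yaml');
import * as yaml from 'js-yaml';

// js-yaml 3.x exposes safeLoad() for parsing YAML text into a plain object.
const parsed = yaml.safeLoad('trialConcurrency: 2');
console.log(parsed);   // { trialConcurrency: 2 }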
5 changes: 5 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token: joi.string().min(1),
host: joi.string().min(1).required(),
reuse: joi.boolean(),
+ cpuNum: joi.number().min(1),
+ memoryMB: joi.number().min(100),
+ gpuNum: joi.number().min(1),
+ maxTrialNumPerGpu: joi.number(),
+ useActiveGpu: joi.boolean(),
}),
kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
operator: joi.string().min(1).required(),
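For context, a small sketch of how the new remote_config fields could be validated with a joi schema shaped like the entries above (the standalone schema and the sample values are illustrative, not code from this PR):

import * as joi from 'joi';

// Illustrative schema mirroring the remote_config fields added in the diff.
const remoteConfigSchema = joi.object({
    host: joi.string().min(1).required(),
    reuse: joi.boolean(),
    cpuNum: joi.number().min(1),
    memoryMB: joi.number().min(100),
    gpuNum: joi.number().min(1),
    maxTrialNumPerGpu: joi.number(),
    useActiveGpu: joi.boolean()
});

// A config with one GPU that allows two trials to share each GPU.
const { error } = remoteConfigSchema.validate({
    host: '10.0.0.1',
    reuse: true,
    gpuNum: 1,
    maxTrialNumPerGpu: 2,
    useActiveGpu: false
});
if (error) {
    throw error;
}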
24 changes: 24 additions & 0 deletions src/nni_manager/training_service/common/gpuData.ts
@@ -3,6 +3,17 @@

'use strict';

+ export enum ScheduleResultType {
+ // Schedule succeeded
+ SUCCEED,
+
+ // Temporarily, no enough available GPU right now
+ TMP_NO_AVAILABLE_GPU,
+
+ // Cannot match requirement even if all GPU are a
Contributor: a => available?

Member (author): Copied from previous code; it should read something like: "No environment can match the hard requirement, e.g. the GPU number is smaller than what the trial asked for."

+ REQUIRE_EXCEED_TOTAL
+ }

/**
* GPU Infromation class
* Representing the dynamic and static information retrieved from Nvidia-smi
@@ -52,6 +63,19 @@ export class GPUSummary {
}
}


+ export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
+ if (gpuIndices !== undefined) {
+ const indices: number[] = gpuIndices.split(',')
+ .map((x: string) => parseInt(x, 10));
+ if (indices.length > 0) {
+ return new Set(indices);
+ } else {
+ throw new Error('gpuIndices can not be empty if specified.');
+ }
+ }
+ }

export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
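A brief usage note on the relocated parseGpuIndices helper (the import path and values below are illustrative):

// Relative path assumes the caller sits next to training_service/; adjust as needed.
import { parseGpuIndices } from './training_service/common/gpuData';

const indices = parseGpuIndices('0,1,3');        // Set { 0, 1, 3 }
const unrestricted = parseGpuIndices(undefined); // undefined: no GPU restriction
console.log(indices, unrestricted);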
4 changes: 4 additions & 0 deletions src/nni_manager/training_service/common/trialConfig.ts
@@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum: number;

+ // This flag is only used for unit tests now.
+ // In the future, all environments should be reusable, and this can be made configurable by the user.
+ public reuseEnvironment: boolean | undefined = true;

/**
* Constructor
* @param command Trail command
15 changes: 13 additions & 2 deletions src/nni_manager/training_service/pai/paiConfig.ts
@@ -3,7 +3,7 @@

'use strict';

- import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
+ import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

export class PAIClusterConfig {
public readonly userName: string;
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
public readonly token?: string;
public readonly reuse?: boolean;

+ public cpuNum?: number;
+ public memoryMB?: number;
+ public gpuNum?: number;
+
+ public useActiveGpu?: boolean;
+ public maxTrialNumPerGpu?: number;

/**
* Constructor
* @param userName User name of PAI Cluster
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
*/
- constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
+ constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
+ cpuNum?: number, memoryMB?: number, gpuNum?: number) {
this.userName = userName;
this.passWord = passWord;
this.host = host;
this.token = token;
this.reuse = reuse;
+ this.cpuNum = cpuNum;
+ this.memoryMB = memoryMB;
+ this.gpuNum = gpuNum;
Contributor: In other platforms like PAI, remote, and Kubeflow, we added the gpuNum and cpuNum settings in trialConfig, not in the cluster config. I think we had better unify them.

Member (author): There are three things about gpuNum:

1. The settings here tell OpenPAI what kind of environment is needed. This is unique to OpenPAI; it does not apply to remote or AML, and I'm not sure about Kubeflow.
2. Environment capacity. This is general to all environments and does not need to be configured like this, since the GPU collector (nvidia-smi) detects environment capacity at runtime.
3. The requirement of a single trial. This lives in trialConfig. Together with the environment capacity above, we can schedule multiple trials on one environment. By the way, cpuNum and memoryMB in the trial config are not used today; they may be useful in the future.
}
}

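To make the distinction in the review thread above concrete, here is a small hypothetical sketch (types and names are illustrative, not NNI's actual classes) of how an environment's detected GPU capacity and a trial's gpuNum requirement can be combined to decide whether another trial fits:

// Hypothetical shapes for illustration only.
interface EnvironmentCapacity {
    gpuCount: number;          // detected at runtime, e.g. by the nvidia-smi based GPU collector
    assignedTrialGpus: number; // GPUs already taken by trials running in this environment
}

interface TrialRequirement {
    gpuNum: number;            // per-trial requirement from trialConfig
}

// True if one more trial can be packed into this environment.
function canScheduleTrial(env: EnvironmentCapacity, trial: TrialRequirement): boolean {
    return env.gpuCount - env.assignedTrialGpus >= trial.gpuNum;
}

// A 4-GPU environment already running one 2-GPU trial can still take another 2-GPU trial.
console.log(canScheduleTrial({ gpuCount: 4, assignedTrialGpus: 2 }, { gpuNum: 2 })); // true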
26 changes: 12 additions & 14 deletions src/nni_manager/training_service/remote_machine/gpuScheduler.ts
@@ -6,10 +6,8 @@
import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
- import { GPUInfo } from '../common/gpuData';
- import {
- parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
- } from './remoteMachineData';
+ import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
+ import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

@@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
- if(requiredGPUNum === undefined) {
+ if (requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
@@ -48,7 +46,7 @@

// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
- rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
+ rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
@@ -75,8 +73,8 @@
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

return {
- resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
- scheduleInfo : undefined
+ resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
+ scheduleInfo: undefined
};
}

@@ -159,7 +157,7 @@
const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
- (num !== undefined && num < maxTrialNumPerGpu)) {
+ (num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
@@ -200,7 +198,7 @@
}

private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
- gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
+ gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
@@ -222,10 +220,10 @@
scheduleInfo: {
rmMeta: rmMeta,
cudaVisibleDevice: allocatedGPUs
- .map((gpuInfo: GPUInfo) => {
- return gpuInfo.index;
- })
- .join(',')
+ .map((gpuInfo: GPUInfo) => {
+ return gpuInfo.index;
+ })
+ .join(',')
}
};
}
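The availability check in this scheduler (the condition involving useActiveGpu and maxTrialNumPerGpu) is fairly dense, so here is a hedged restatement of the same logic as a standalone helper; the parameter names mirror the diff, but this sketch is not the code NNI ships:

// trialsOnGpu: number of NNI trials already placed on this GPU, or undefined if none.
// activeProcessNum: processes reported on the GPU (possibly from other users or jobs).
function isGpuAvailable(
    trialsOnGpu: number | undefined,
    activeProcessNum: number,
    useActiveGpu: boolean,
    maxTrialNumPerGpu: number = 1
): boolean {
    if (trialsOnGpu === undefined) {
        // No NNI trial on this GPU yet: usable if the GPU is idle, or if the user
        // opted in (useActiveGpu) to sharing GPUs that already have active processes.
        return useActiveGpu || activeProcessNum === 0;
    }
    // Some NNI trials already run here: allow more only up to maxTrialNumPerGpu.
    return trialsOnGpu < maxTrialNumPerGpu;
}

console.log(isGpuAvailable(undefined, 0, false));  // true: idle GPU
console.log(isGpuAvailable(undefined, 2, false));  // false: busy GPU, sharing not enabled
console.log(isGpuAvailable(1, 0, false, 2));       // true: 1 < maxTrialNumPerGpu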
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
@@ -4,7 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
import { ShellExecutor } from './shellExecutor';

/**
Expand All @@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public readonly useActiveGpu?: boolean = false;
}

- export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
- if (gpuIndices !== undefined) {
- const indices: number[] = gpuIndices.split(',')
- .map((x: string) => parseInt(x, 10));
- if (indices.length > 0) {
- return new Set(indices);
- } else {
- throw new Error('gpuIndices can not be empty if specified.');
- }
- }
- }

/**
* The execution result for command executed on remote machine
*/
@@ -168,14 +156,3 @@ export class ExecutorManager {
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };

- export enum ScheduleResultType {
- // Schedule succeeded
- SUCCEED,
-
- // Temporarily, no enough available GPU right now
- TMP_NO_AVAILABLE_GPU,
-
- // Cannot match requirement even if all GPU are a
- REQUIRE_EXCEED_TOTAL
- }
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -7,6 +7,7 @@ import * as assert from 'assert';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
+ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
@@ -22,18 +23,16 @@ import {
getVersion, uniqueString
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
- import { GPUSummary } from '../common/gpuData';
+ import { GPUSummary, ScheduleResultType } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util';
import { GPUScheduler } from './gpuScheduler';
import {
- RemoteMachineMeta,
- RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
- ScheduleResultType, ExecutorManager
+ ExecutorManager, RemoteMachineMeta,
+ RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
} from './remoteMachineData';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
- import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

/**
* Training Service implementation for Remote Machine (Linux)
src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
@@ -3,7 +3,6 @@

'use strict';

- import { EventEmitter } from 'events';
import { delay } from "../../../common/utils";
import { AMLEnvironmentInformation } from '../aml/amlConfig';
import { CommandChannel, RunnerConnection } from "../commandChannel";
@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
export class AMLCommandChannel extends CommandChannel {
private stopping: boolean = false;
private sendQueues: [EnvironmentInformation, string][] = [];
- private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`;

- public constructor(commandEmitter: EventEmitter) {
- super(commandEmitter);
- }

public get channelName(): Channel {
return "aml";
}
@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
const messages = command['trial_runner'];
if (messages) {
if (messages instanceof Object && currentMessageIndex < messages.length - 1) {
- for (let index = currentMessageIndex + 1; index < messages.length; index ++) {
+ for (let index = currentMessageIndex + 1; index < messages.length; index++) {
this.handleCommand(runnerConnection.environment, messages[index]);
}
currentMessageIndex = messages.length - 1;
- } else if (currentMessageIndex === -1){
+ } else if (currentMessageIndex === -1) {
this.handleCommand(runnerConnection.environment, messages);
currentMessageIndex += 1;
}