-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Reusable environment support GPU scheduler, add test cases and refactoring. #2627
Changes from all commits
ecb360f
c8f82f9
53556d7
e2fae9b
a96dd7c
1cdb6e8
b410cba
0b540eb
e84b205
f6381c3
0e8c494
e13d3e3
e27a791
98647b4
bf8a1a2
ae598bf
1f26855
250d6e5
20b23d6
8990b4d
946d052
68ef948
a4441c0
86db9a3
16f4690
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,17 @@ | |
|
||
'use strict'; | ||
|
||
export enum ScheduleResultType { | ||
// Schedule succeeded | ||
SUCCEED, | ||
|
||
// Temporarily, no enough available GPU right now | ||
TMP_NO_AVAILABLE_GPU, | ||
|
||
// Cannot match requirement even if all GPU are a | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a => available? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Copied from previous code, should be like below, No environment can match the hard requirement, like the GPU number is smaller than trial asked. |
||
REQUIRE_EXCEED_TOTAL | ||
} | ||
|
||
/** | ||
* GPU Infromation class | ||
* Representing the dynamic and static information retrieved from Nvidia-smi | ||
|
@@ -52,6 +63,19 @@ export class GPUSummary { | |
} | ||
} | ||
|
||
|
||
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined { | ||
if (gpuIndices !== undefined) { | ||
const indices: number[] = gpuIndices.split(',') | ||
.map((x: string) => parseInt(x, 10)); | ||
if (indices.length > 0) { | ||
return new Set(indices); | ||
} else { | ||
throw new Error('gpuIndices can not be empty if specified.'); | ||
} | ||
} | ||
} | ||
|
||
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string = | ||
` | ||
$env:METRIC_OUTPUT_DIR="{0}" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
|
||
'use strict'; | ||
|
||
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; | ||
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; | ||
|
||
export class PAIClusterConfig { | ||
public readonly userName: string; | ||
|
@@ -12,6 +12,13 @@ export class PAIClusterConfig { | |
public readonly token?: string; | ||
public readonly reuse?: boolean; | ||
|
||
public cpuNum?: number; | ||
public memoryMB?: number; | ||
public gpuNum?: number; | ||
|
||
public useActiveGpu?: boolean; | ||
public maxTrialNumPerGpu?: number; | ||
|
||
/** | ||
* Constructor | ||
* @param userName User name of PAI Cluster | ||
|
@@ -20,12 +27,16 @@ export class PAIClusterConfig { | |
* @param token PAI token of PAI Cluster | ||
* @param reuse If job is reusable for multiple trials | ||
*/ | ||
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) { | ||
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean, | ||
cpuNum?: number, memoryMB?: number, gpuNum?: number) { | ||
this.userName = userName; | ||
this.passWord = passWord; | ||
this.host = host; | ||
this.token = token; | ||
this.reuse = reuse; | ||
this.cpuNum = cpuNum; | ||
this.memoryMB = memoryMB; | ||
this.gpuNum = gpuNum; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in other platforms like pai, remote, kubeflow, we added gpuNum, cpuNum setting in trialConfig, not cluster config field. I think we better to unify them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are three things about gpuNum,
|
||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
already has this
js-yaml
package? refer line 72There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It uses to provide a TypeScript type declaration, so that we can import js lib like
import * as yaml from 'js-yaml';
, instead ofconst yaml = require('js-yaml');
.