diff --git a/ts/nni_manager/training_service/reusable/environment.ts b/ts/nni_manager/training_service/reusable/environment.ts index 3f021676db..124a7a0cd0 100644 --- a/ts/nni_manager/training_service/reusable/environment.ts +++ b/ts/nni_manager/training_service/reusable/environment.ts @@ -51,6 +51,8 @@ export class EnvironmentInformation { // uses to count how many trial runs on this environment. // it can be used in many scenarios, but for now, it uses for reusable. public assignedTrialCount: number = 0; + // time (from Date.now()) at which a trial was most recently released from this environment, or -1 when no release is recorded; used to compute how long the environment has been idle. + public latestTrialReleasedTime: number = -1; // NNI environment ID public id: string; diff --git a/ts/nni_manager/training_service/reusable/gpuScheduler.ts b/ts/nni_manager/training_service/reusable/gpuScheduler.ts index 84f8ca4234..86f590746d 100644 --- a/ts/nni_manager/training_service/reusable/gpuScheduler.ts +++ b/ts/nni_manager/training_service/reusable/gpuScheduler.ts @@ -10,7 +10,7 @@ import { GPUInfo, ScheduleResultType } from '../common/gpuData'; import { EnvironmentInformation } from './environment'; import { TrialDetail } from './trial'; -type SCHEDULE_POLICY_NAME = 'random' | 'round-robin'; +type SCHEDULE_POLICY_NAME = 'random' | 'round-robin' | 'recently-idle'; export class GpuSchedulerSetting { public useActiveGpu: boolean = false; @@ -30,7 +30,7 @@ export class GpuScheduler { // private readonly machineExecutorMap: Set; private readonly log: Logger = getLogger(); - private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin'; + private readonly policyName: SCHEDULE_POLICY_NAME = 'recently-idle'; private defaultSetting: GpuSchedulerSetting; private roundRobinIndex: number = 0; @@ -101,6 +101,7 @@ export class GpuScheduler { trial.environment.defaultGpuSummary !== undefined && trial.assignedGpus !== undefined && trial.assignedGpus.length > 0) { + for (const gpuInfo of trial.assignedGpus) { const defaultGpuSummary = trial.environment.defaultGpuSummary; const num: number | undefined = 
defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index); @@ -190,10 +191,30 @@ export class GpuScheduler { return randomSelect(qualifiedEnvironments); } else if (this.policyName === 'round-robin') { return this.roundRobinSelect(qualifiedEnvironments, allEnvironments); + } else if (this.policyName === 'recently-idle') { + return this.recentlyIdleSelect(qualifiedEnvironments, allEnvironments); } else { throw new Error(`Unsupported schedule policy: ${this.policyName}`); } } + + // Select the qualified environment that most recently released a trial and reset its release time to -1. If no qualified environment has a recorded release time, fall back to round-robin selection. + private recentlyIdleSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation { + const now = Date.now(); + let selectedEnvironment: EnvironmentInformation | undefined = undefined; + let minTimeInterval = Number.MAX_SAFE_INTEGER; + for (const environment of qualifiedEnvironments) { + if (environment.latestTrialReleasedTime > 0 && (now - environment.latestTrialReleasedTime) < minTimeInterval) { + selectedEnvironment = environment; + minTimeInterval = now - environment.latestTrialReleasedTime; + } + } + if (selectedEnvironment === undefined) { + return this.roundRobinSelect(qualifiedEnvironments, allEnvironments); + } + selectedEnvironment.latestTrialReleasedTime = -1; + return selectedEnvironment; + } private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation { while (!qualifiedEnvironments.includes(allEnvironments[this.roundRobinIndex % allEnvironments.length])) { diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts index 00934200b9..5085cd1fc3 100644 --- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts @@ -732,6 +732,7 @@ class TrialDispatcher implements 
TrainingService { throw new Error(`TrialDispatcher: environment ${trial.environment.id} has no counted running trial!`); } trial.environment.runningTrialCount--; + trial.environment.latestTrialReleasedTime = Date.now(); trial.environment = undefined; } if (true === this.enableGpuScheduler) {