Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Report the error more gracefully when nvidia-smi does not exist #1418

Merged
merged 9 commits into from
Aug 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/nni_manager/training_service/local/gpuScheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ class GPUScheduler {
} catch (error) {
this.log.error('Read GPU summary failed with error: ', error);
}
if (this.gpuSummary !== undefined && this.gpuSummary.gpuCount === 0) {
throw new Error('GPU not available. Please check your CUDA configuration');
}
await delay(5000);
}
}
Expand Down
42 changes: 21 additions & 21 deletions src/nni_manager/training_service/local/localTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ class LocalTrainingService implements TrainingService {
private readonly occupiedGpuIndexNumMap: Map<number, number>;
private designatedGpuIndices!: Set<number>;
private readonly log: Logger;
private localTrailConfig?: TrialConfig;
private localTrialConfig?: TrialConfig;
private localConfig?: LocalConfig;
private isMultiPhase: boolean;
private readonly jobStreamMap: Map<string, ts.Stream>;
Expand Down Expand Up @@ -204,7 +204,7 @@ class LocalTrainingService implements TrainingService {
} catch (error) {
//ignore
}
this.log.debug(`trailJob status update: ${trialJobId}, ${trialJob.status}`);
this.log.debug(`trialJob status update: ${trialJobId}, ${trialJob.status}`);
}
}

Expand Down Expand Up @@ -302,14 +302,14 @@ class LocalTrainingService implements TrainingService {
}
switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG:
this.localTrailConfig = <TrialConfig>JSON.parse(value);
this.localTrialConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
throw new Error('trial config parsed failed');
}
if (this.localTrailConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
if (this.localTrialConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler();
}
}
Expand Down Expand Up @@ -343,10 +343,10 @@ class LocalTrainingService implements TrainingService {
switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG:
let getResult: Promise<string>;
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
} else {
getResult = Promise.resolve(JSON.stringify(this.localTrailConfig));
getResult = Promise.resolve(JSON.stringify(this.localTrialConfig));
}

return getResult;
Expand Down Expand Up @@ -427,8 +427,8 @@ class LocalTrainingService implements TrainingService {
}

private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
if (this.localTrailConfig === undefined) {
throw new Error('localTrailConfig is not initialized!');
if (this.localTrialConfig === undefined) {
throw new Error('localTrialConfig is not initialized!');
}

const resource: { gpuIndices: number[] } = { gpuIndices: [] };
Expand All @@ -450,11 +450,11 @@ class LocalTrainingService implements TrainingService {
selectedGPUIndices = selectedGPUIndices.filter((index: number) => this.designatedGpuIndices.has(index));
}

if (selectedGPUIndices.length < this.localTrailConfig.gpuNum) {
if (selectedGPUIndices.length < this.localTrialConfig.gpuNum) {
return [false, resource];
}

selectedGPUIndices.splice(this.localTrailConfig.gpuNum);
selectedGPUIndices.splice(this.localTrialConfig.gpuNum);
Object.assign(resource, { gpuIndices: selectedGPUIndices });

return [true, resource];
Expand Down Expand Up @@ -512,17 +512,17 @@ class LocalTrainingService implements TrainingService {
}
}

private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = [];
if (process.platform === 'win32') {
script.push(
`cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`cmd /c ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
} else {
script.push(
`eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`eval ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`);
}

Expand All @@ -531,23 +531,23 @@ class LocalTrainingService implements TrainingService {

private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
throw new Error(`localTrialConfig not initialized!`);
}
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);

if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
throw new Error('trial config is not initialized');
}
const runScriptContent: string[] = [];
if (process.platform !== 'win32') {
runScriptContent.push('#!/bin/bash');
}
runScriptContent.push(`cd ${this.localTrailConfig.codeDir}`);
runScriptContent.push(`cd ${this.localTrialConfig.codeDir}`);
for (const variable of variables) {
runScriptContent.push(setEnvironmentVariable(variable));
}
const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
const scripts: string[] = this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory);
scripts.forEach((script: string) => {
runScriptContent.push(script);
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,12 +511,16 @@ class RemoteMachineTrainingService implements TrainingService {
// tslint:disable-next-line: no-floating-promises
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);

this.timer.subscribe(
const disposable: Rx.IDisposable = this.timer.subscribe(
async (tick: number) => {
const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
`tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
if (rmMeta.gpuSummary.gpuCount === 0) {
this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
this.timer.unsubscribe(disposable);
}
}
}
);
Expand Down
39 changes: 28 additions & 11 deletions tools/nni_gpu_tool/gpu_metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import subprocess
import sys
import time
import traceback

from xml.dom import minidom

Expand All @@ -33,7 +34,7 @@ def check_ready_to_run():
pidList.remove(os.getpid())
return len(pidList) == 0
else:
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
Expand All @@ -45,23 +46,21 @@ def main(argv):
if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit
exit(2)
with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile:
pass
os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
cmd = 'nvidia-smi -q -x'
cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
smi_output = subprocess.check_output(cmd, shell=True)
parse_nvidia_smi_result(smi_output, metrics_output_dir)
except:
exception = sys.exc_info()
for e in exception:
print("job exporter error {}".format(e))
smi_output = subprocess.check_output(cmd)
except Exception:
traceback.print_exc()
gen_empty_gpu_metric(metrics_output_dir)
break
parse_nvidia_smi_result(smi_output, metrics_output_dir)
# TODO: change to sleep time configurable via arguments
time.sleep(5)

def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
Expand All @@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except :
e_info = sys.exc_info()
print('xmldoc paring error')
finally:
os.umask(old_umask)

def gen_empty_gpu_metric(outputDir):
    """Append a placeholder record reporting zero GPUs to the metrics file.

    Writes one JSON line to ``<outputDir>/gpu_metrics`` containing the current
    local time as ``Timestamp``, ``gpuCount`` set to 0 and an empty
    ``gpuInfos`` list. The record is also printed to stdout. Any exception
    raised while writing is reported via ``traceback.print_exc()`` instead of
    being propagated to the caller.

    Args:
        outputDir: Directory that holds the ``gpu_metrics`` file.
    """
    # Clear the umask *before* entering the try block so that `old_umask` is
    # always bound when the finally clause restores it. With the umask cleared,
    # a newly created metrics file is not restricted by the process umask.
    old_umask = os.umask(0)
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception:
        traceback.print_exc()
    finally:
        # Always put the caller's umask back, whether or not the write worked.
        os.umask(old_umask)


if __name__ == "__main__":
Expand Down