Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Improve stability of remote training service. (#2474)
Browse files Browse the repository at this point in the history
  • Loading branch information
squirrelsc authored May 25, 2020
1 parent e640ad6 commit be09f11
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 34 deletions.
2 changes: 1 addition & 1 deletion src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL');
this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return result;
}

public killChildProcesses(pidFileName: string): string {
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
// prevent trialkeeper to be killed, so it can save exit code.
const command = `list_descendants ()
let command = `list_descendants ()
{
local children=$(ps -o pid= --ppid "$1")
Expand All @@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children"
}
kill $(list_descendants \`cat '${pidFileName}'\`)`
if (killSelf) {
command += `\nkill \`cat '${pidFileName}'\``
}
return command;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return result;
}

/**
 * Build the PowerShell command line that stops every descendant of the
 * process whose PID is stored in `pidFileName`.
 *
 * The generated script reads the PID into `$ppid`, defines a recursive
 * `Kill-Tree` function that walks `Win32_Process` by `ParentProcessId`,
 * and force-stops each process it visits except `$ppid` itself.
 *
 * @param pidFileName path of the file that contains the root process id
 * @param killSelf when true, the root process is stopped as well, after
 *                 its descendants have been stopped
 * @returns the complete `powershell "..."` command text
 */
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
    // The optional root kill must live INSIDE the quoted script: `$ppid` is a
    // PowerShell variable defined by that script and does not exist outside it.
    const killRoot = killSelf ? `; Stop-Process -Id $ppid` : ``;
    // Note: no quote character may appear between the opening `powershell "`
    // and the final `"`, otherwise the script is cut short.
    return `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
        `Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
        `if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force}}` +
        `kill-tree $ppid${killRoot}"`;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ abstract class OsCommands {
public abstract readLastLines(fileName: string, lineCount: number): string;
public abstract isProcessAliveCommand(pidFileName: string): string;
public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
public abstract killChildProcesses(pidFileName: string): string;
public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
public abstract extractFile(tarFileName: string, targetFolder: string): string;
public abstract executeScript(script: string, isFile: boolean): string;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
if (restServer.getErrorMessage !== undefined) {
throw new Error(restServer.getErrorMessage);
this.stopping = true;
throw new Error(restServer.getErrorMessage);
}
await delay(3000);
}
Expand Down Expand Up @@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (executor !== undefined) {
this.log.info(`killing gpu metric collector on ${executor.name}`);
const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
await executor.killChildProcesses(gpuJobPidPath);
await executor.killChildProcesses(gpuJobPidPath, true);
}
executorManager.releaseAllExecutor();
}
Expand Down Expand Up @@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this.timer.unsubscribe(disposable);
}
}
if (this.stopping){
this.timer.unsubscribe(disposable);
this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
}
collectingCount.pop();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,8 @@ class ShellExecutor {
return result !== undefined ? result : false;
}

/**
 * Run the OS-specific "kill child processes" command for the process whose
 * PID is stored in `pidFileName`.
 *
 * @param pidFileName path of the file that contains the root process id
 * @param killSelf when true, the root process is killed too; defaults to
 *                 false so existing one-argument callers keep their behavior
 * @returns true when the executed command exits with code 0
 */
public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
    // `osCommands` may be unset; in that case the falsy value is handed to
    // `execute` unchanged, exactly as before.
    const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
    const commandResult = await this.execute(commandText);
    return commandResult.exitCode === 0;
}
Expand Down
24 changes: 1 addition & 23 deletions tools/nni_gpu_tool/gpu_metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,9 @@
from xml.dom import minidom


def check_ready_to_run():
    """Report whether this process may start collecting GPU metrics.

    Looks for other ``nni_gpu_tool.gpu_metrics_collector`` processes (via
    ``wmic`` on Windows, ``pgrep`` elsewhere).

    Returns:
        True when no other collector process is listed, False otherwise.

    Raises:
        subprocess.CalledProcessError: when the process-listing command fails
            for a reason other than "nothing matched".
    """
    if sys.platform == 'win32':
        pgrep_output = subprocess.check_output(
            'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
        tokens = pgrep_output.decode("utf-8").strip().split()
        own_pid = os.getpid()
        # tokens[0] is the 'ProcessId' column header. Slicing (instead of pop)
        # and filtering (instead of remove) keep this working when the output
        # is empty or does not list the current process.
        other_pids = [int(tok) for tok in tokens[1:] if tok.isdigit() and int(tok) != own_pid]
        return not other_pids

    try:
        pgrep_output = subprocess.check_output(
            'pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
    except subprocess.CalledProcessError as err:
        # pgrep exits with status 1 when no process matched: nothing else is running.
        if err.returncode == 1:
            return True
        raise

    # Lines are "<pid> <command line>"; skip pgrep itself, this process and its parent.
    ignored_prefixes = ('%s ' % os.getpid(), '%s ' % os.getppid())
    other_lines = []
    for raw_line in pgrep_output.splitlines():
        line = raw_line.decode()
        if "pgrep " in line or line.startswith(ignored_prefixes):
            continue
        other_lines.append(line)
    return not other_lines


def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
print("GPU metrics collector is already running. exiting...")
exit(2)

cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
Expand Down

0 comments on commit be09f11

Please sign in to comment.