This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Fix remote reuse bugs #2981

Merged · 49 commits · Oct 20, 2020
Changes shown are from 44 of the 49 commits.

Commits (49):
5982baa · fix endpoint (Feb 10, 2020)
5653624 · add private key (Feb 10, 2020)
e5ee726 · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (Feb 10, 2020)
1cb2349 · fix torchversion (Feb 10, 2020)
e53d4ec · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (Feb 11, 2020)
99b0c08 · add debug info (Feb 11, 2020)
fe60ba5 · add port in pscp.exe (Feb 11, 2020)
52aa6ad · fix remote pipeline (Feb 13, 2020)
533c504 · Merge branch 'v1.4' of https://github.com/microsoft/nni into dev-remo… (Feb 13, 2020)
8f024fa · Merge branch 'v1.4.1' of https://github.com/microsoft/nni into dev-re… (Mar 5, 2020)
6a96403 · fix remote-windows-pipeline (Mar 5, 2020)
7afab04 · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (SparkSnail, Mar 31, 2020)
13bb9f2 · remove sudo (SparkSnail, Mar 31, 2020)
6e1d822 · fix error (Mar 31, 2020)
5a91061 · Merge branch 'dev-remote-pipeline' of https://github.com/microsoft/nn… (Mar 31, 2020)
4faeb06 · format code (Mar 31, 2020)
21aa3b5 · fix error (Mar 31, 2020)
ffa8c6b · fix error (Mar 31, 2020)
4c49f60 · debug (Mar 31, 2020)
1057ad2 · remove clean step (SparkSnail, Apr 1, 2020)
0a52185 · format code (SparkSnail, Apr 1, 2020)
8b20632 · Merge branch 'v1.5' of https://github.com/microsoft/nni into dev-remo… (SparkSnail, Apr 4, 2020)
f07f3f4 · fix windows copy (SparkSnail, Apr 4, 2020)
939b1d4 · fix conflict (SparkSnail, Aug 13, 2020)
1f78669 · fix pipeline (SparkSnail, Aug 13, 2020)
7b7cadc · fix remote pipeline (SparkSnail, Aug 13, 2020)
96d41e4 · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (SparkSnail, Sep 4, 2020)
04f5645 · fix remote it (SparkSnail, Sep 4, 2020)
beafde2 · format annotation (SparkSnail, Sep 4, 2020)
c3c6135 · fix platform judge method (SparkSnail, Sep 4, 2020)
c0674dd · Merge branch 'v1.9' of https://github.com/microsoft/nni into dev-remo… (SparkSnail, Oct 13, 2020)
0e1bb25 · add remote reuse pipeline (SparkSnail, Oct 13, 2020)
4a6e1d4 · fix pipeline (SparkSnail, Oct 13, 2020)
9ed3cf4 · update pipeline (SparkSnail, Oct 13, 2020)
4e57606 · fix pipeline (SparkSnail, Oct 13, 2020)
257627f · remove unused code (SparkSnail, Oct 13, 2020)
d2967e5 · format error message (SparkSnail, Oct 13, 2020)
9880f79 · add more timeout for remote pipeline (SparkSnail, Oct 14, 2020)
bdbfa6e · debug kill process (SparkSnail, Oct 14, 2020)
0dd7c97 · revert change (SparkSnail, Oct 14, 2020)
5cd22f0 · fix check status (SparkSnail, Oct 15, 2020)
7e54be7 · Merge branch 'v1.9' of https://github.com/microsoft/nni into dev-remo… (SparkSnail, Oct 19, 2020)
4dc2a41 · fix v1.9 (SparkSnail, Oct 19, 2020)
5cc6898 · fix eslint (SparkSnail, Oct 19, 2020)
620e4ec · fix comments (SparkSnail, Oct 19, 2020)
fbf758a · add check file command (SparkSnail, Oct 19, 2020)
7c80d45 · update (SparkSnail, Oct 19, 2020)
706b1b0 · revert change (SparkSnail, Oct 19, 2020)
cb69b81 · add annotation (SparkSnail, Oct 20, 2020)
@@ -137,40 +137,37 @@ export class RemoteEnvironmentService extends EnvironmentService {

     private async refreshEnvironment(environment: EnvironmentInformation): Promise<void> {
         const executor = await this.getExecutor(environment.id);
-        const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
-        const runnerReturnCodeFilePath: string = `${environment.runnerWorkingFolder}/code`;
-        if (fs.existsSync(jobpidPath)) {
-            /* eslint-disable require-atomic-updates */
-            try {
-                const isAlive = await executor.isProcessAlive(jobpidPath);
-                // if the process of jobpid is not alive any more
-                if (!isAlive) {
-                    const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
-                    if (remoteEnvironment.rmMachineMeta === undefined) {
-                        throw new Error(`${remoteEnvironment.id} machine meta not initialized!`);
-                    }
-                    this.log.info(`pid in ${remoteEnvironment.rmMachineMeta.ip}:${jobpidPath} is not alive!`);
-                    if (fs.existsSync(runnerReturnCodeFilePath)) {
-                        const runnerReturnCode: string = await executor.getRemoteFileContent(runnerReturnCodeFilePath);
-                        const match: RegExpMatchArray | null = runnerReturnCode.trim()
-                            .match(/^-?(\d+)\s+(\d+)$/);
-                        if (match !== null) {
-                            const { 1: code } = match;
-                            // Update trial job's status based on result code
-                            if (parseInt(code, 10) === 0) {
-                                environment.setStatus('SUCCEEDED');
-                            } else {
-                                environment.setStatus('FAILED');
-                            }
-                            this.releaseEnvironmentResource(environment);
-                        }
-                    }
-                }
-            } catch (error) {
-                this.releaseEnvironmentResource(environment);
-                this.log.error(`Update job status exception, error is ${error.message}`);
-            }
-        }
+        const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
+        const runnerReturnCodeFilePath: string = `${environment.runnerWorkingFolder}/code`;
+        /* eslint-disable require-atomic-updates */
+        try {
+            const isAlive = await executor.isProcessAlive(jobpidPath);
+            // if the process of jobpid is not alive any more
+            if (!isAlive) {
+                const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
+                if (remoteEnvironment.rmMachineMeta === undefined) {
+                    throw new Error(`${remoteEnvironment.id} machine meta not initialized!`);
+                }
+                this.log.info(`pid in ${remoteEnvironment.rmMachineMeta.ip}:${jobpidPath} is not alive!`);
+                if (fs.existsSync(runnerReturnCodeFilePath)) {
+                    const runnerReturnCode: string = await executor.getRemoteFileContent(runnerReturnCodeFilePath);
+                    const match: RegExpMatchArray | null = runnerReturnCode.trim()
+                        .match(/^-?(\d+)\s+(\d+)$/);
+                    if (match !== null) {
+                        const { 1: code } = match;
+                        // Update trial job's status based on result code
+                        if (parseInt(code, 10) === 0) {
+                            environment.setStatus('SUCCEEDED');
+                        } else {
+                            environment.setStatus('FAILED');
+                        }
+                        this.releaseEnvironmentResource(environment);
+                    }
+                }
+            }
+        } catch (error) {
+            this.log.error(`Update job status exception, error is ${error.message}`);
+        }
     }

Review thread on the added `} catch (error) {` line:
Contributor: No need to release the environment resource here? Why?

Contributor Author: There is a case where an environment has been submitted but starts slowly and has not yet started its process or created the pid file. When the system then calls the refresh function and tries to read the pid file, it raises a "no such file" exception.

Member (@squirrelsc, Oct 19, 2020): I see. It looks like you don't need to check this file. Check environment.isRunnerReady first, then check the file; that relies on the first initialized message. Also, the environment status is set to RUNNING too early. For remote, it's better to wait for isRunnerReady first, then check the file status and set the status to RUNNING, SUCCEEDED, or FAILED.

Contributor Author: No. If the environment fails to start, isRunnerReady will always be false, but we still need to refresh the environment status to FAILED here.

Member: How do you know it failed to start? You could wait for the pid file during initialization instead of setting the environment to RUNNING directly.

Contributor Author: The process return code is checked to detect whether the environment failed to start. Logic to detect whether the pid file exists has been added.

     public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
@@ -245,6 +242,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
             'envs', environment.id)
         environment.command = `cd ${environment.runnerWorkingFolder} && \
 ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
+1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \
 && echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;
         return Promise.resolve(true);
     }
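
The two hunks above work together: the runner command now redirects stdout/stderr to trialrunner_stdout/trialrunner_stderr and, when the runner exits, writes `<exit code> <epoch milliseconds>` into the `code` file, which `refreshEnvironment` parses with the `/^-?(\d+)\s+(\d+)$/` regex. The review thread also settles on tolerating a not-yet-created pid file instead of treating it as a failure. Below is a minimal Python sketch of that refresh flow, under stated assumptions: the real implementation is the TypeScript above, and `remote_file_exists`, `read_remote_file`, and `is_process_alive` are hypothetical stand-ins for the NNI executor methods.

import re

# Matches the line written by "echo $? `date +%s%3N`", e.g. "0 1603180800123".
CODE_LINE = re.compile(r"^-?(\d+)\s+(\d+)$")

def refresh_environment(env, remote_file_exists, read_remote_file, is_process_alive):
    """Sketch of the pid/code-file refresh logic; the helper callables are stand-ins."""
    pid_path = f"{env['runner_working_folder']}/pid"
    code_path = f"{env['runner_working_folder']}/code"

    # A freshly submitted environment may not have created its pid file yet;
    # that means "check again later", not "failed".
    if not remote_file_exists(pid_path):
        return

    if is_process_alive(pid_path):
        return  # runner still alive, nothing to update

    # Runner exited: read the recorded exit code (if written) and set the status.
    if remote_file_exists(code_path):
        match = CODE_LINE.match(read_remote_file(code_path).strip())
        if match is not None:
            exit_code = int(match.group(1))
            env['status'] = 'SUCCEEDED' if exit_code == 0 else 'FAILED'
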
2 changes: 2 additions & 0 deletions test/config/training_service.yml
@@ -95,6 +95,8 @@ pai:
   containerNFSMountPath:
   paiStorageConfigName:
 remote:
+  remoteConfig:
+    reuse: false
   machineList:
   - ip:
     passwd:
3 changes: 3 additions & 0 deletions test/nni_test/nnitest/generate_ts_config.py
@@ -86,6 +86,8 @@ def update_training_service_config(args):
             config[args.ts]['machineList'][0]['port'] = args.remote_port
         if args.remote_pwd is not None:
             config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
+        if args.remote_reuse is not None:
+            config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'

     dump_yml_content(TRAINING_SERVICE_FILE, config)

@@ -119,6 +121,7 @@ def update_training_service_config(args):
     parser.add_argument("--remote_pwd", type=str)
     parser.add_argument("--remote_host", type=str)
     parser.add_argument("--remote_port", type=int)
+    parser.add_argument("--remote_reuse", type=str)
     args = parser.parse_args()

     update_training_service_config(args)
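
Pipeline variables arrive as plain strings, so the new `--remote_reuse` flag is declared as `type=str` and converted to a boolean with `.lower() == 'true'` before being written into `remoteConfig.reuse`. A small standalone sketch of that conversion follows; the file name `demo_training_service.yml` and the use of PyYAML are assumptions for the example (the real script edits `test/config/training_service.yml` through its own `dump_yml_content` helper).

import argparse

import yaml  # PyYAML, assumed here for the standalone example

parser = argparse.ArgumentParser()
parser.add_argument("--remote_reuse", type=str)  # pipeline variables are strings, not booleans
args = parser.parse_args()

config = {"remote": {"remoteConfig": {"reuse": False}, "machineList": [{"ip": None, "passwd": None}]}}

# Convert the string flag ("true"/"True"/"false"/...) into a real boolean.
if args.remote_reuse is not None:
    config["remote"]["remoteConfig"]["reuse"] = args.remote_reuse.lower() == "true"

with open("demo_training_service.yml", "w") as f:
    yaml.safe_dump(config, f, default_flow_style=False)

Running this sketch with `--remote_reuse true` leaves `reuse: true` in the generated file, which is what the pipeline change below passes through `$(remote_reuse)`.
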
2 changes: 1 addition & 1 deletion test/pipelines/pipelines-it-remote-linux-to-linux.yml
@@ -62,7 +62,7 @@ jobs:
   - script: |
       set -e
       cd test
-      python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(docker_user) --remote_host $(remote_host) \
+      python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_reuse $(remote_reuse) --remote_user $(docker_user) --remote_host $(remote_host) \
        --remote_port $(cat port) --remote_pwd $(docker_pwd) --nni_manager_ip $(nni_manager_ip)
       cat config/training_service.yml
       PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
11 changes: 7 additions & 4 deletions tools/nni_trial_tool/trial.py
@@ -137,10 +137,13 @@ def is_running(self):
     def kill(self, trial_id=None):
         if trial_id == self.id or trial_id is None:
             if self.process is not None:
-                nni_log(LogType.Info, "%s: killing trial" % self.name)
-                for child in psutil.Process(self.process.pid).children(True):
-                    child.kill()
-                self.process.kill()
+                try:
+                    nni_log(LogType.Info, "%s: killing trial" % self.name)
+                    for child in psutil.Process(self.process.pid).children(True):
+                        child.kill()
+                    self.process.kill()
+                except Exception as ex:
+                    nni_log(LogType.Error, "kill trial %s failed: %s " % (trial_id, str(ex)))
             self.cleanup()

     def cleanup(self):

Review thread on the added `child.kill()` line:
Member: If there is a try/except, it's better to wrap each kill individually, so that one failure doesn't affect the others.

Contributor Author: The exception handling is there to guard psutil.Process(), not the child kills. In some scenarios the trial has already exited and the kill() command arrives later, which raises a process-not-found error.

Review thread on the added `except Exception as ex:` line:

Contributor (@liuzhe-lz, Oct 19, 2020): Can we catch a more specific exception?

Contributor Author: OK, added psutil.NoSuchProcess.

Review thread on the added error-level `nni_log` line:

Member: This is just cleanup, so it doesn't need an error-level log; debug or info is enough.

Contributor Author: psutil.NoSuchProcess is an expected exit condition, so info level is used for it. For other, unexpected issues, I think error level is better.
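
Taken together, the review feedback on this hunk is: guard each kill separately so one failure doesn't skip the rest, treat psutil.NoSuchProcess as an expected, info-level event (the trial may simply have exited before the kill request arrived), and keep error level for genuinely unexpected failures. A hedged sketch of that pattern, not the exact code merged in the follow-up commits:

import psutil


def kill_process_tree(pid, log_info=print, log_error=print):
    """Kill a process and its children, tolerating processes that already exited."""
    try:
        parent = psutil.Process(pid)
        procs = parent.children(recursive=True) + [parent]
    except psutil.NoSuchProcess:
        # Expected when the trial exited before the kill request arrived.
        log_info("process %s already exited" % pid)
        return

    # Kill each process independently so one failure does not skip the others.
    for proc in procs:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            log_info("process %s already exited" % proc.pid)
        except Exception as ex:
            # Unexpected failures still warrant error level.
            log_error("kill process %s failed: %s" % (proc.pid, ex))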