Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Fix trial keeper wrongly exit issue #152

Merged
merged 5 commits into from
Sep 30, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions tools/trial_tool/trial_keeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def main_loop(args):
# Notice: We don't specify env, which means the subprocess will inherit the current environment; that is the expected behavior
process = Popen(args.trial_command, shell = True, stdout = stdout_file, stderr = stderr_file)
print('Subprocess pid is {}'.format(process.pid))
print('Current cwd is {}'.format(os.getcwd()))
while True:
retCode = process.poll()
## Read experiment metrics, to avoid missing metrics
Expand All @@ -55,15 +54,15 @@ def main_loop(args):
print('subprocess terminated. Exit code is {}. Quit'.format(retCode))
#copy local directory to hdfs
nni_local_output_dir = os.environ['NNI_OUTPUT_DIR']
hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name)
print(nni_local_output_dir, args.pai_hdfs_output_dir)
hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5)
try:
if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client):
print('copy directory success!')
print('copy directory from {0} to {1} success!'.format(nni_local_output_dir, args.pai_hdfs_output_dir))
else:
print('copy directory failed!')
print('copy directory from {0} to {1} failed!'.format(nni_local_output_dir, args.pai_hdfs_output_dir))
except Exception as exception:
print(exception)
print('HDFS copy directory got exception')
raise exception

## Exit as the retCode of subprocess(trial)
exit(retCode)
Expand Down Expand Up @@ -91,7 +90,10 @@ def trial_keeper_help_info(*args):

try:
main_loop(args)
except:
print('Exiting by user request')
except SystemExit as se:
print('NNI trial keeper exit with code {}'.format(se.code))
sys.exit(se.code)
except Exception as e:
print('Exit trial keeper with code 1 because Exception: {} is catched'.format(str(e)))
sys.exit(1)