From c5e26efdcf15879e379977d14fab9e050152f814 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jul 2020 23:44:28 +0800 Subject: [PATCH 01/10] add trial job detail link --- src/nni_manager/training_service/pai/paiConfig.ts | 4 +++- src/nni_manager/training_service/pai/paiJobInfoCollector.ts | 2 +- .../training_service/pai/paiK8S/paiK8STrainingService.ts | 4 +++- src/nni_manager/training_service/reusable/trialDispatcher.ts | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index eceda619c7..fa38d007d7 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -45,9 +45,10 @@ export class PAITrialJobDetail implements TrialJobDetail { public form: TrialJobApplicationForm; public logPath: string; public isEarlyStopped?: boolean; + public paiJobDetailUrl?: string; constructor(id: string, status: TrialJobStatus, paiJobName: string, - submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) { + submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string, paiJobDetailUrl?: string) { this.id = id; this.status = status; this.paiJobName = paiJobName; @@ -56,5 +57,6 @@ export class PAITrialJobDetail implements TrialJobDetail { this.form = form; this.tags = []; this.logPath = logPath; + this.paiJobDetailUrl = paiJobDetailUrl; } } diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index eb15765a4f..2590547849 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -84,7 +84,7 @@ export class PAIJobInfoCollector { if (response.body.jobStatus.appTrackingUrl) { paiTrialJob.url = response.body.jobStatus.appTrackingUrl; } else { - paiTrialJob.url = paiTrialJob.logPath; + paiTrialJob.url = paiTrialJob.paiJobDetailUrl; } } break; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index e243387d39..f046fcda5c 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -123,6 +123,7 @@ class PAIK8STrainingService extends PAITrainingService { const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId); + const paiJobDetailUrl: string = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${paiJobName}`; const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', @@ -130,7 +131,8 @@ class PAIK8STrainingService extends PAITrainingService { Date.now(), trialWorkingFolder, form, - logPath); + logPath, + paiJobDetailUrl); this.trialJobsMap.set(trialJobId, trialJobDetail); this.jobQueue.push(trialJobId); diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 156909e129..ff324899c6 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -362,6 +362,7 @@ class TrialDispatcher implements TrainingService { liveTrialsCount++; continue; } + trial.url = environment.trackingUrl; const environmentStatus = environment.status; // any node exit, then make sure the whole trial stopped. From 785a324ca162f8c3ece138b9666c88008aec459f Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jul 2020 23:54:01 +0800 Subject: [PATCH 02/10] Revert "add trial job detail link" This reverts commit c5e26efdcf15879e379977d14fab9e050152f814. --- src/nni_manager/training_service/pai/paiConfig.ts | 4 +--- src/nni_manager/training_service/pai/paiJobInfoCollector.ts | 2 +- .../training_service/pai/paiK8S/paiK8STrainingService.ts | 4 +--- src/nni_manager/training_service/reusable/trialDispatcher.ts | 1 - 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index fa38d007d7..eceda619c7 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -45,10 +45,9 @@ export class PAITrialJobDetail implements TrialJobDetail { public form: TrialJobApplicationForm; public logPath: string; public isEarlyStopped?: boolean; - public paiJobDetailUrl?: string; constructor(id: string, status: TrialJobStatus, paiJobName: string, - submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string, paiJobDetailUrl?: string) { + submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) { this.id = id; this.status = status; this.paiJobName = paiJobName; @@ -57,6 +56,5 @@ export class PAITrialJobDetail implements TrialJobDetail { this.form = form; this.tags = []; this.logPath = logPath; - this.paiJobDetailUrl = paiJobDetailUrl; } } diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 2590547849..eb15765a4f 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -84,7 +84,7 @@ export class PAIJobInfoCollector { if (response.body.jobStatus.appTrackingUrl) { paiTrialJob.url = response.body.jobStatus.appTrackingUrl; } else { - paiTrialJob.url = paiTrialJob.paiJobDetailUrl; + paiTrialJob.url = paiTrialJob.logPath; } } break; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index f046fcda5c..e243387d39 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -123,7 +123,6 @@ class PAIK8STrainingService extends PAITrainingService { const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId); - const paiJobDetailUrl: string = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${paiJobName}`; const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', @@ -131,8 +130,7 @@ class PAIK8STrainingService extends PAITrainingService { Date.now(), trialWorkingFolder, form, - logPath, - paiJobDetailUrl); + logPath); this.trialJobsMap.set(trialJobId, trialJobDetail); this.jobQueue.push(trialJobId); diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index ff324899c6..156909e129 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -362,7 +362,6 @@ class TrialDispatcher implements TrainingService { liveTrialsCount++; continue; } - trial.url = environment.trackingUrl; const environmentStatus = environment.status; // any node exit, then make sure the whole trial stopped. From 0f0567cadd7c6f0e83eceaa747724c502329b6bb Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jul 2020 23:58:37 +0800 Subject: [PATCH 03/10] fix webchannel connection --- tools/nni_trial_tool/web_channel.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index 752a303cb0..2bd5a3a760 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -24,8 +24,11 @@ def __init__(self, args): def _inner_open(self): url = "ws://{}:{}".format(self.args.nnimanager_ip, self.args.nnimanager_port) nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) - - connect = websockets.connect(url) + try: + connect = websockets.connect(url) + except Exception as ex: + nni_log(LogType.ERROR, 'WebChannel: create connection failed %s' % ex) + exit(1) self._event_loop = asyncio.get_event_loop() client = self._event_loop.run_until_complete(connect) self.client = client From bcab8a9bb28f8dab48ad786b56d5b3fdf018c54e Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 20 Jul 2020 09:53:26 +0800 Subject: [PATCH 04/10] refactor logic --- tools/nni_trial_tool/web_channel.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index 2bd5a3a760..33ef8b2305 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -24,11 +24,7 @@ def __init__(self, args): def _inner_open(self): url = "ws://{}:{}".format(self.args.nnimanager_ip, self.args.nnimanager_port) nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) - try: - connect = websockets.connect(url) - except Exception as ex: - nni_log(LogType.ERROR, 'WebChannel: create connection failed %s' % ex) - exit(1) + connect = websockets.connect(url) self._event_loop = asyncio.get_event_loop() client = self._event_loop.run_until_complete(connect) self.client = client @@ -43,7 +39,11 @@ def _inner_close(self): def _inner_send(self, message): loop = asyncio.new_event_loop() - loop.run_until_complete(self.client.send(message)) + try: + loop.run_until_complete(self.client.send(message)) + except Exception as ex: + nni_log(LogType.ERROR, 'WebChannel: send message failed %s' % ex) + exit(1) def _inner_receive(self): messages = [] From 15ea95afd86c5e87d7f6c4c43ee4be0be274e685 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 21 Jul 2020 20:10:51 +0800 Subject: [PATCH 05/10] add timeout --- tools/nni_trial_tool/web_channel.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index 33ef8b2305..f1354f1feb 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -16,6 +16,7 @@ def __init__(self, args): self.args = args self.client = None self.in_cache = b"" + self.timeout = 10 super(WebChannel, self).__init__(args) @@ -39,10 +40,11 @@ def _inner_close(self): def _inner_send(self, message): loop = asyncio.new_event_loop() + send = asyncio.wait_for(self.client.send(message), self.timeout) try: - loop.run_until_complete(self.client.send(message)) - except Exception as ex: - nni_log(LogType.ERROR, 'WebChannel: send message failed %s' % ex) + loop.run_until_complete(send) + except asyncio.exceptions.TimeoutError: + nni_log(LogType.ERROR, 'WebChannel: send message to %s:%s failed!' % (self.args.nnimanager_ip, self.args.nnimanager_port)) exit(1) def _inner_receive(self): From a3b9d391bc791487b1c74433791f7c58c27f29b0 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 21 Jul 2020 20:51:40 +0800 Subject: [PATCH 06/10] format --- tools/nni_trial_tool/web_channel.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index f1354f1feb..6b43beffec 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -16,7 +16,7 @@ def __init__(self, args): self.args = args self.client = None self.in_cache = b"" - self.timeout = 10 + self.timeout = 5 super(WebChannel, self).__init__(args) @@ -24,11 +24,15 @@ def __init__(self, args): def _inner_open(self): url = "ws://{}:{}".format(self.args.nnimanager_ip, self.args.nnimanager_port) - nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) - connect = websockets.connect(url) - self._event_loop = asyncio.get_event_loop() - client = self._event_loop.run_until_complete(connect) - self.client = client + try: + connect = asyncio.wait_for(websockets.connect(url), self.timeout) + self._event_loop = asyncio.get_event_loop() + client = self._event_loop.run_until_complete(connect) + self.client = client + nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) + except asyncio.TimeoutError: + nni_log(LogType.Error, 'WebChannel: connect %s failed!' % url) + exit(1) def _inner_close(self): if self.client is not None: @@ -40,12 +44,7 @@ def _inner_close(self): def _inner_send(self, message): loop = asyncio.new_event_loop() - send = asyncio.wait_for(self.client.send(message), self.timeout) - try: - loop.run_until_complete(send) - except asyncio.exceptions.TimeoutError: - nni_log(LogType.ERROR, 'WebChannel: send message to %s:%s failed!' % (self.args.nnimanager_ip, self.args.nnimanager_port)) - exit(1) + loop.run_until_complete(self.client.send(message)) def _inner_receive(self): messages = [] From 4a8dbcb16ac678bc0300b14952ef025a783ebb3d Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 21 Jul 2020 20:53:12 +0800 Subject: [PATCH 07/10] reset timeout value --- tools/nni_trial_tool/web_channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index 6b43beffec..a5ed38bb94 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -16,7 +16,7 @@ def __init__(self, args): self.args = args self.client = None self.in_cache = b"" - self.timeout = 5 + self.timeout = 10 super(WebChannel, self).__init__(args) From 6b9dd7137caad7f6edc843a2f21a2b4271f26587 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 23 Jul 2020 10:44:14 +0800 Subject: [PATCH 08/10] fix comments --- tools/nni_trial_tool/web_channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index a5ed38bb94..e759f36942 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -31,7 +31,7 @@ def _inner_open(self): self.client = client nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) except asyncio.TimeoutError: - nni_log(LogType.Error, 'WebChannel: connect %s failed!' % url) + nni_log(LogType.Error, 'connect to %s timeout! Please make sure NNIManagerIP configured correclty, and accessable.' % url) exit(1) def _inner_close(self): From ab8e130e861009f7bcf820df204e3e86797095c2 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 23 Jul 2020 10:53:14 +0800 Subject: [PATCH 09/10] unify code to use os._exit(1) --- tools/nni_trial_tool/web_channel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index e759f36942..db3f038637 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import asyncio - +import os import websockets from .base_channel import BaseChannel @@ -32,7 +32,7 @@ def _inner_open(self): nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) except asyncio.TimeoutError: nni_log(LogType.Error, 'connect to %s timeout! Please make sure NNIManagerIP configured correclty, and accessable.' % url) - exit(1) + os._exit(1) def _inner_close(self): if self.client is not None: From b8fbc9d3aa55279167dc5e2504ad2d4f1439c635 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 24 Jul 2020 15:47:42 +0800 Subject: [PATCH 10/10] fix typo --- tools/nni_trial_tool/web_channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index db3f038637..a68be47908 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -31,7 +31,7 @@ def _inner_open(self): self.client = client nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) except asyncio.TimeoutError: - nni_log(LogType.Error, 'connect to %s timeout! Please make sure NNIManagerIP configured correclty, and accessable.' % url) + nni_log(LogType.Error, 'connect to %s timeout! Please make sure NNIManagerIP configured correctly, and accessable.' % url) os._exit(1) def _inner_close(self):