From 533d8bdf95f1873cdb4157f81fe51d0d0c62acc8 Mon Sep 17 00:00:00 2001 From: Ning Shang Date: Thu, 10 Dec 2020 15:07:08 +0800 Subject: [PATCH 1/7] change SIGKILL to SIGTERM in local cancel trial job --- .../training_service/local/localTrainingService.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index ab9d037f99..8c18437118 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -253,7 +253,14 @@ class LocalTrainingService implements TrainingService { return Promise.resolve(); } - tkill(trialJob.pid, 'SIGKILL'); + tkill(trialJob.pid, 'SIGTERM'); + const pid = trialJob.pid; + setTimeout(((pid: number) => { + tkill(pid, 'SIGKILL', (err) => { + this.log.warning(`cancel trial job {pid: ${pid}} failed: ${err?.message}`); + }); + }).bind(this), 5 * 1000, pid); + this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped)); return Promise.resolve(); From ed87bc14aa3cb7959fa77890567ad265c07af7b3 Mon Sep 17 00:00:00 2001 From: Ning Shang Date: Thu, 10 Dec 2020 15:16:26 +0800 Subject: [PATCH 2/7] fix bug --- ts/nni_manager/training_service/local/localTrainingService.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index 8c18437118..0c18fa518c 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -257,7 +257,9 @@ class LocalTrainingService implements TrainingService { const pid = trialJob.pid; setTimeout(((pid: number) => { tkill(pid, 'SIGKILL', (err) => { - this.log.warning(`cancel trial job {pid: ${pid}} failed: ${err?.message}`); + if (err){ + this.log.warning(`cancel trial job {pid: ${pid}} failed: ${err.message}`); + } }); }).bind(this), 5 * 1000, pid); From fe65dfaf81b3b37196d5253d7a30f5d6fecdef96 Mon Sep 17 00:00:00 2001 From: Ning Shang Date: Thu, 10 Dec 2020 15:25:50 +0800 Subject: [PATCH 3/7] fix eslint --- ts/nni_manager/training_service/local/localTrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index 0c18fa518c..63b362e6c8 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -255,7 +255,7 @@ class LocalTrainingService implements TrainingService { } tkill(trialJob.pid, 'SIGTERM'); const pid = trialJob.pid; - setTimeout(((pid: number) => { + setTimeout(((pid: number): void => { tkill(pid, 'SIGKILL', (err) => { if (err){ this.log.warning(`cancel trial job {pid: ${pid}} failed: ${err.message}`); From 36a09321df56f0a6e1981ced6307945825b80028 Mon Sep 17 00:00:00 2001 From: Ning Shang Date: Thu, 10 Dec 2020 16:31:25 +0800 Subject: [PATCH 4/7] reduce delay duration --- ts/nni_manager/training_service/local/localTrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index 63b362e6c8..b43747096e 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -261,7 +261,7 @@ class LocalTrainingService implements TrainingService { this.log.warning(`cancel trial job {pid: ${pid}} failed: ${err.message}`); } }); - }).bind(this), 5 * 1000, pid); + }).bind(this), 1000, pid); this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped)); From 41a40ece6314520e2e4427f1f17e108dc0b79d48 Mon Sep 17 00:00:00 2001 From: Ning Shang Date: Mon, 14 Dec 2020 09:47:50 +0800 Subject: [PATCH 5/7] change to loop check alive --- .../local/localTrainingService.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index b43747096e..a6d2264377 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -254,14 +254,14 @@ class LocalTrainingService implements TrainingService { return Promise.resolve(); } tkill(trialJob.pid, 'SIGTERM'); - const pid = trialJob.pid; - setTimeout(((pid: number): void => { - tkill(pid, 'SIGKILL', (err) => { - if (err){ - this.log.warning(`cancel trial job {pid: ${pid}} failed: ${err.message}`); - } - }); - }).bind(this), 1000, pid); + let waitingTime = 0; + while(await isAlive(trialJob.pid)) { + if (waitingTime > 4999) { + tkill(trialJob.pid, 'SIGKILL'); + } + await delay(500); + waitingTime += 500; + } this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped)); From ae8bf16d11d677900ff90e6561c96207bbe1dc53 Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 15 Dec 2020 13:17:57 +0800 Subject: [PATCH 6/7] add break --- .../training_service/local/localTrainingService.ts | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index a6d2264377..0984d1b3c5 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -254,13 +254,17 @@ class LocalTrainingService implements TrainingService { return Promise.resolve(); } tkill(trialJob.pid, 'SIGTERM'); - let waitingTime = 0; + let startTime = Date.now(); while(await isAlive(trialJob.pid)) { - if (waitingTime > 4999) { - tkill(trialJob.pid, 'SIGKILL'); + if (Date.now() - startTime > 4999) { + tkill(trialJob.pid, 'SIGKILL', (err) => { + if (err) { + this.log.error(`kill trial job error: ${err}`); + } + }); + break; } await delay(500); - waitingTime += 500; } this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped)); From c2ff7e6fe395c3d14fcdff06d88fc41bd372ee51 Mon Sep 17 00:00:00 2001 From: J-shang Date: Tue, 15 Dec 2020 13:35:05 +0800 Subject: [PATCH 7/7] fix eslint --- ts/nni_manager/training_service/local/localTrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/local/localTrainingService.ts b/ts/nni_manager/training_service/local/localTrainingService.ts index 0984d1b3c5..fb2b81e2ad 100644 --- a/ts/nni_manager/training_service/local/localTrainingService.ts +++ b/ts/nni_manager/training_service/local/localTrainingService.ts @@ -254,7 +254,7 @@ class LocalTrainingService implements TrainingService { return Promise.resolve(); } tkill(trialJob.pid, 'SIGTERM'); - let startTime = Date.now(); + const startTime = Date.now(); while(await isAlive(trialJob.pid)) { if (Date.now() - startTime > 4999) { tkill(trialJob.pid, 'SIGKILL', (err) => {