From a272da9e5330c97126c22bd415ee3e08aebc121d Mon Sep 17 00:00:00 2001 From: J-shang <33053116+J-shang@users.noreply.github.com> Date: Wed, 10 Mar 2021 18:36:36 +0800 Subject: [PATCH] hotfix unhandled `TrainingService is not assigned` and extend exec time in pipeline (#3442) --- test/config/assessors/curvefitting.yml | 2 +- test/nni_test/nnitest/run_tests.py | 4 ++-- ts/nni_manager/core/nnimanager.ts | 30 +++++++++++++++----------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/test/config/assessors/curvefitting.yml b/test/config/assessors/curvefitting.yml index 66159c0dd1..ab6f4283ce 100644 --- a/test/config/assessors/curvefitting.yml +++ b/test/config/assessors/curvefitting.yml @@ -1,6 +1,6 @@ authorName: nni experimentName: default_test -maxExecDuration: 5m +maxExecDuration: 10m maxTrialNum: 8 trialConcurrency: 8 searchSpacePath: ../naive_trial/search_space.json diff --git a/test/nni_test/nnitest/run_tests.py b/test/nni_test/nnitest/run_tests.py index ef90c4f003..d0d285829a 100644 --- a/test/nni_test/nnitest/run_tests.py +++ b/test/nni_test/nnitest/run_tests.py @@ -260,9 +260,9 @@ def run(args): continue # remote mode need more time to cleanup if args.ts == 'remote': - wait_for_port_available(8080, 180) + wait_for_port_available(8080, 240) else: - wait_for_port_available(8080, 30) + wait_for_port_available(8080, 60) # adl mode need more time to cleanup PVC if args.ts == 'adl' and name == 'nnictl-resume-2': diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 997f25ee68..f7acaf5a58 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -326,22 +326,26 @@ class NNIManager implements Manager { } public async stopExperimentBottomHalf(): Promise<void> { - const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs(); - - // DON'T try to make it in parallel, the training service may not handle it well. 
- // If there is performance concern, consider to support batch cancellation on training service. - for (const trialJob of trialJobList) { - if (trialJob.status === 'RUNNING' || - trialJob.status === 'WAITING') { - try { - this.log.info(`cancelTrialJob: ${trialJob.id}`); - await this.trainingService.cancelTrialJob(trialJob.id); - } catch (error) { - this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`); + try { + const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs(); + + // DON'T try to make it in parallel, the training service may not handle it well. + // If there is performance concern, consider to support batch cancellation on training service. + for (const trialJob of trialJobList) { + if (trialJob.status === 'RUNNING' || + trialJob.status === 'WAITING') { + try { + this.log.info(`cancelTrialJob: ${trialJob.id}`); + await this.trainingService.cancelTrialJob(trialJob.id); + } catch (error) { + this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`); + } } } + await this.trainingService.cleanUp(); + } catch (err) { + this.log.error(`${err.stack}`); } - await this.trainingService.cleanUp(); if (this.experimentProfile.endTime === undefined) { this.setEndtime(); }