Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
hotfix unhandled TrainingService is not assigned and extend exec time in pipeline (#3442)
Browse files Browse the repository at this point in the history
  • Loading branch information
J-shang authored Mar 10, 2021
1 parent 62af469 commit a272da9
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 16 deletions.
2 changes: 1 addition & 1 deletion test/config/assessors/curvefitting.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
authorName: nni
experimentName: default_test
maxExecDuration: 5m
maxExecDuration: 10m
maxTrialNum: 8
trialConcurrency: 8
searchSpacePath: ../naive_trial/search_space.json
Expand Down
4 changes: 2 additions & 2 deletions test/nni_test/nnitest/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,9 @@ def run(args):
continue
# remote mode need more time to cleanup
if args.ts == 'remote':
wait_for_port_available(8080, 180)
wait_for_port_available(8080, 240)
else:
wait_for_port_available(8080, 30)
wait_for_port_available(8080, 60)

# adl mode need more time to cleanup PVC
if args.ts == 'adl' and name == 'nnictl-resume-2':
Expand Down
30 changes: 17 additions & 13 deletions ts/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -326,22 +326,26 @@ class NNIManager implements Manager {
}

public async stopExperimentBottomHalf(): Promise<void> {
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();

// DON'T try to make it in parallel, the training service may not handle it well.
// If there is performance concern, consider to support batch cancellation on training service.
for (const trialJob of trialJobList) {
if (trialJob.status === 'RUNNING' ||
trialJob.status === 'WAITING') {
try {
this.log.info(`cancelTrialJob: ${trialJob.id}`);
await this.trainingService.cancelTrialJob(trialJob.id);
} catch (error) {
this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`);
try {
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();

// DON'T try to make it in parallel, the training service may not handle it well.
// If there is performance concern, consider to support batch cancellation on training service.
for (const trialJob of trialJobList) {
if (trialJob.status === 'RUNNING' ||
trialJob.status === 'WAITING') {
try {
this.log.info(`cancelTrialJob: ${trialJob.id}`);
await this.trainingService.cancelTrialJob(trialJob.id);
} catch (error) {
this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`);
}
}
}
await this.trainingService.cleanUp();
} catch (err) {
this.log.error(`${err.stack}`);
}
await this.trainingService.cleanUp();
if (this.experimentProfile.endTime === undefined) {
this.setEndtime();
}
Expand Down

0 comments on commit a272da9

Please sign in to comment.