This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Fix remote reuse bugs #2981

Merged · 49 commits · Oct 20, 2020
Changes shown are from 44 of the 49 commits.

Commits (49):
5982baa · fix endpoint (Feb 10, 2020)
5653624 · add private key (Feb 10, 2020)
e5ee726 · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (Feb 10, 2020)
1cb2349 · fix torchversion (Feb 10, 2020)
e53d4ec · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (Feb 11, 2020)
99b0c08 · add debug info (Feb 11, 2020)
fe60ba5 · add port in pscp.exe (Feb 11, 2020)
52aa6ad · fix remote pipeline (Feb 13, 2020)
533c504 · Merge branch 'v1.4' of https://github.com/microsoft/nni into dev-remo… (Feb 13, 2020)
8f024fa · Merge branch 'v1.4.1' of https://github.com/microsoft/nni into dev-re… (Mar 5, 2020)
6a96403 · fix remote-windows-pipeline (Mar 5, 2020)
7afab04 · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (SparkSnail, Mar 31, 2020)
13bb9f2 · remove sudo (SparkSnail, Mar 31, 2020)
6e1d822 · fix error (Mar 31, 2020)
5a91061 · Merge branch 'dev-remote-pipeline' of https://github.com/microsoft/nn… (Mar 31, 2020)
4faeb06 · format code (Mar 31, 2020)
21aa3b5 · fix error (Mar 31, 2020)
ffa8c6b · fix error (Mar 31, 2020)
4c49f60 · debug (Mar 31, 2020)
1057ad2 · remove clean step (SparkSnail, Apr 1, 2020)
0a52185 · format code (SparkSnail, Apr 1, 2020)
8b20632 · Merge branch 'v1.5' of https://github.com/microsoft/nni into dev-remo… (SparkSnail, Apr 4, 2020)
f07f3f4 · fix windows copy (SparkSnail, Apr 4, 2020)
939b1d4 · fix conflict (SparkSnail, Aug 13, 2020)
1f78669 · fix pipeline (SparkSnail, Aug 13, 2020)
7b7cadc · fix remote pipeline (SparkSnail, Aug 13, 2020)
96d41e4 · Merge branch 'master' of https://github.com/microsoft/nni into dev-re… (SparkSnail, Sep 4, 2020)
04f5645 · fix remote it (SparkSnail, Sep 4, 2020)
beafde2 · format annotation (SparkSnail, Sep 4, 2020)
c3c6135 · fix platform judge method (SparkSnail, Sep 4, 2020)
c0674dd · Merge branch 'v1.9' of https://github.com/microsoft/nni into dev-remo… (SparkSnail, Oct 13, 2020)
0e1bb25 · add remote reuse pipeline (SparkSnail, Oct 13, 2020)
4a6e1d4 · fix pipeline (SparkSnail, Oct 13, 2020)
9ed3cf4 · update pipeline (SparkSnail, Oct 13, 2020)
4e57606 · fix pipeline (SparkSnail, Oct 13, 2020)
257627f · remove unused code (SparkSnail, Oct 13, 2020)
d2967e5 · format error message (SparkSnail, Oct 13, 2020)
9880f79 · add more timeout for remote pipeline (SparkSnail, Oct 14, 2020)
bdbfa6e · debug kill process (SparkSnail, Oct 14, 2020)
0dd7c97 · revert change (SparkSnail, Oct 14, 2020)
5cd22f0 · fix check status (SparkSnail, Oct 15, 2020)
7e54be7 · Merge branch 'v1.9' of https://github.com/microsoft/nni into dev-remo… (SparkSnail, Oct 19, 2020)
4dc2a41 · fix v1.9 (SparkSnail, Oct 19, 2020)
5cc6898 · fix eslint (SparkSnail, Oct 19, 2020)
620e4ec · fix comments (SparkSnail, Oct 19, 2020)
fbf758a · add check file command (SparkSnail, Oct 19, 2020)
7c80d45 · update (SparkSnail, Oct 19, 2020)
706b1b0 · revert change (SparkSnail, Oct 19, 2020)
cb69b81 · add annotation (SparkSnail, Oct 20, 2020)
@@ -137,40 +137,37 @@ export class RemoteEnvironmentService extends EnvironmentService {

     private async refreshEnvironment(environment: EnvironmentInformation): Promise<void> {
         const executor = await this.getExecutor(environment.id);
-        const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
-        const runnerReturnCodeFilePath: string = `${environment.runnerWorkingFolder}/code`;
-        if (fs.existsSync(jobpidPath)) {
-            /* eslint-disable require-atomic-updates */
-            try {
-                const isAlive = await executor.isProcessAlive(jobpidPath);
-                // if the process of jobpid is not alive any more
-                if (!isAlive) {
-                    const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
-                    if (remoteEnvironment.rmMachineMeta === undefined) {
-                        throw new Error(`${remoteEnvironment.id} machine meta not initialized!`);
-                    }
-                    this.log.info(`pid in ${remoteEnvironment.rmMachineMeta.ip}:${jobpidPath} is not alive!`);
-                    if (fs.existsSync(runnerReturnCodeFilePath)) {
-                        const runnerReturnCode: string = await executor.getRemoteFileContent(runnerReturnCodeFilePath);
-                        const match: RegExpMatchArray | null = runnerReturnCode.trim()
-                            .match(/^-?(\d+)\s+(\d+)$/);
-                        if (match !== null) {
-                            const { 1: code } = match;
-                            // Update trial job's status based on result code
-                            if (parseInt(code, 10) === 0) {
-                                environment.setStatus('SUCCEEDED');
-                            } else {
-                                environment.setStatus('FAILED');
-                            }
-                            this.releaseEnvironmentResource(environment);
-                        }
-                    }
-                }
-            } catch (error) {
-                this.releaseEnvironmentResource(environment);
-                this.log.error(`Update job status exception, error is ${error.message}`);
-            }
-        }
+        const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
+        const runnerReturnCodeFilePath: string = `${environment.runnerWorkingFolder}/code`;
+        /* eslint-disable require-atomic-updates */
+        try {
+            const isAlive = await executor.isProcessAlive(jobpidPath);
+            // if the process of jobpid is not alive any more
+            if (!isAlive) {
+                const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
+                if (remoteEnvironment.rmMachineMeta === undefined) {
+                    throw new Error(`${remoteEnvironment.id} machine meta not initialized!`);
+                }
+                this.log.info(`pid in ${remoteEnvironment.rmMachineMeta.ip}:${jobpidPath} is not alive!`);
+                if (fs.existsSync(runnerReturnCodeFilePath)) {
+                    const runnerReturnCode: string = await executor.getRemoteFileContent(runnerReturnCodeFilePath);
+                    const match: RegExpMatchArray | null = runnerReturnCode.trim()
+                        .match(/^-?(\d+)\s+(\d+)$/);
+                    if (match !== null) {
+                        const { 1: code } = match;
+                        // Update trial job's status based on result code
+                        if (parseInt(code, 10) === 0) {
+                            environment.setStatus('SUCCEEDED');
+                        } else {
+                            environment.setStatus('FAILED');
+                        }
+                        this.releaseEnvironmentResource(environment);
+                    }
+                }
+            }
+        } catch (error) {
+            this.log.error(`Update job status exception, error is ${error.message}`);
+        }
     }

Review thread on the added `} catch (error) {` line:
Contributor: No need to release the environment resource here? Why?

Contributor Author: There is a case where an environment has been submitted but starts slowly and has not yet started its process or created the pid file. When the system then calls the refresh function and tries to read the pid file, it raises a "no such file" exception.

Member (@squirrelsc, Oct 19, 2020): I see. It looks like you don't need to check this file. Check environment.isRunnerReady first, then check the file; that relies on the first initialized message. Also, the environment status is set to RUNNING too early. For remote, it's better to wait for isRunnerReady first, then check the file status and set the status to RUNNING, SUCCEEDED, or FAILED.

Contributor Author: No. If the environment fails to start, isRunnerReady will always be false, but we still need to refresh the environment status to FAILED here.

Member: How do you know it failed to start? You could wait for the pid file during initialization instead of setting the environment to RUNNING directly.

Contributor Author: The process return code is checked to detect whether the environment failed to start. Logic to detect whether the pid file exists has been added.

     public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
@@ -245,6 +242,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
             'envs', environment.id)
         environment.command = `cd ${environment.runnerWorkingFolder} && \
 ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
+1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \
 && echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;
         return Promise.resolve(true);
     }
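
The two hunks above work together: the runner command now redirects stdout/stderr to trialrunner_stdout/trialrunner_stderr and, when the runner exits, writes `<exit code> <epoch milliseconds>` into the `code` file, which `refreshEnvironment` parses with the `/^-?(\d+)\s+(\d+)$/` regex. The review thread also settles on tolerating a not-yet-created pid file instead of treating it as a failure. Below is a minimal Python sketch of that refresh flow, under stated assumptions: the real implementation is the TypeScript above, and `remote_file_exists`, `read_remote_file`, and `is_process_alive` are hypothetical stand-ins for the NNI executor methods.

import re

# Matches the line written by "echo $? `date +%s%3N`", e.g. "0 1603180800123".
CODE_LINE = re.compile(r"^-?(\d+)\s+(\d+)$")

def refresh_environment(env, remote_file_exists, read_remote_file, is_process_alive):
    """Sketch of the pid/code-file refresh logic; the helper callables are stand-ins."""
    pid_path = f"{env['runner_working_folder']}/pid"
    code_path = f"{env['runner_working_folder']}/code"

    # A freshly submitted environment may not have created its pid file yet;
    # that means "check again later", not "failed".
    if not remote_file_exists(pid_path):
        return

    if is_process_alive(pid_path):
        return  # runner still alive, nothing to update

    # Runner exited: read the recorded exit code (if written) and set the status.
    if remote_file_exists(code_path):
        match = CODE_LINE.match(read_remote_file(code_path).strip())
        if match is not None:
            exit_code = int(match.group(1))
            env['status'] = 'SUCCEEDED' if exit_code == 0 else 'FAILED'
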
2 changes: 2 additions & 0 deletions test/config/training_service.yml
@@ -95,6 +95,8 @@ pai:
   containerNFSMountPath:
   paiStorageConfigName:
 remote:
+  remoteConfig:
+    reuse: false
   machineList:
   - ip:
     passwd:
3 changes: 3 additions & 0 deletions test/nni_test/nnitest/generate_ts_config.py
@@ -86,6 +86,8 @@ def update_training_service_config(args):
             config[args.ts]['machineList'][0]['port'] = args.remote_port
         if args.remote_pwd is not None:
             config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
+        if args.remote_reuse is not None:
+            config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'

     dump_yml_content(TRAINING_SERVICE_FILE, config)

@@ -119,6 +121,7 @@ def update_training_service_config(args):
     parser.add_argument("--remote_pwd", type=str)
     parser.add_argument("--remote_host", type=str)
     parser.add_argument("--remote_port", type=int)
+    parser.add_argument("--remote_reuse", type=str)
     args = parser.parse_args()

     update_training_service_config(args)
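
Pipeline variables arrive as plain strings, so the new `--remote_reuse` flag is declared as `type=str` and converted to a boolean with `.lower() == 'true'` before being written into `remoteConfig.reuse`. A small standalone sketch of that conversion follows; the file name `demo_training_service.yml` and the use of PyYAML are assumptions for the example (the real script edits `test/config/training_service.yml` through its own `dump_yml_content` helper).

import argparse

import yaml  # PyYAML, assumed here for the standalone example

parser = argparse.ArgumentParser()
parser.add_argument("--remote_reuse", type=str)  # pipeline variables are strings, not booleans
args = parser.parse_args()

config = {"remote": {"remoteConfig": {"reuse": False}, "machineList": [{"ip": None, "passwd": None}]}}

# Convert the string flag ("true"/"True"/"false"/...) into a real boolean.
if args.remote_reuse is not None:
    config["remote"]["remoteConfig"]["reuse"] = args.remote_reuse.lower() == "true"

with open("demo_training_service.yml", "w") as f:
    yaml.safe_dump(config, f, default_flow_style=False)

Running this sketch with `--remote_reuse true` leaves `reuse: true` in the generated file, which is what the pipeline change below passes through `$(remote_reuse)`.
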
2 changes: 1 addition & 1 deletion test/pipelines/pipelines-it-remote-linux-to-linux.yml
@@ -62,7 +62,7 @@ jobs:
   - script: |
       set -e
       cd test
-      python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(docker_user) --remote_host $(remote_host) \
+      python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_reuse $(remote_reuse) --remote_user $(docker_user) --remote_host $(remote_host) \
        --remote_port $(cat port) --remote_pwd $(docker_pwd) --nni_manager_ip $(nni_manager_ip)
       cat config/training_service.yml
       PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
11 changes: 7 additions & 4 deletions tools/nni_trial_tool/trial.py
@@ -137,10 +137,13 @@ def is_running(self):
     def kill(self, trial_id=None):
         if trial_id == self.id or trial_id is None:
             if self.process is not None:
-                nni_log(LogType.Info, "%s: killing trial" % self.name)
-                for child in psutil.Process(self.process.pid).children(True):
-                    child.kill()
-                self.process.kill()
+                try:
+                    nni_log(LogType.Info, "%s: killing trial" % self.name)
+                    for child in psutil.Process(self.process.pid).children(True):
+                        child.kill()
+                    self.process.kill()
+                except Exception as ex:
+                    nni_log(LogType.Error, "kill trial %s failed: %s " % (trial_id, str(ex)))
             self.cleanup()

     def cleanup(self):

Review thread on the added `child.kill()` line:
Member: If there is a try/except, it's better to wrap each kill individually, so that one failure doesn't affect the others.

Contributor Author: The exception handling is there to guard psutil.Process(), not the child kills. In some scenarios the trial has already exited and the kill() command arrives later, which raises a process-not-found error.

Review thread on the added `except Exception as ex:` line:

Contributor (@liuzhe-lz, Oct 19, 2020): Can we catch a more specific exception?

Contributor Author: OK, added psutil.NoSuchProcess.

Review thread on the added error-level `nni_log` line:

Member: This is just cleanup, so it doesn't need an error-level log; debug or info is enough.

Contributor Author: psutil.NoSuchProcess is an expected exit condition, so info level is used for it. For other, unexpected issues, I think error level is better.
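
Taken together, the review feedback on this hunk is: guard each kill separately so one failure doesn't skip the rest, treat psutil.NoSuchProcess as an expected, info-level event (the trial may simply have exited before the kill request arrived), and keep error level for genuinely unexpected failures. A hedged sketch of that pattern, not the exact code merged in the follow-up commits:

import psutil


def kill_process_tree(pid, log_info=print, log_error=print):
    """Kill a process and its children, tolerating processes that already exited."""
    try:
        parent = psutil.Process(pid)
        procs = parent.children(recursive=True) + [parent]
    except psutil.NoSuchProcess:
        # Expected when the trial exited before the kill request arrived.
        log_info("process %s already exited" % pid)
        return

    # Kill each process independently so one failure does not skip the others.
    for proc in procs:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            log_info("process %s already exited" % proc.pid)
        except Exception as ex:
            # Unexpected failures still warrant error level.
            log_error("kill process %s failed: %s" % (proc.pid, ex))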