From 80bc9537b6952a879e3a7805996cb3c862a29ee8 Mon Sep 17 00:00:00 2001
From: Ni Hao
Date: Wed, 7 Apr 2021 19:30:02 +0800
Subject: [PATCH] add reusable for win remote (#3500)

---
 .../common/containerJobData.ts                | 11 +++++
 .../remote_machine/shellExecutor.ts           |  3 +-
 .../environments/remoteEnvironmentService.ts  | 46 ++++++++++++++++---
 .../reusable/trialDispatcher.ts               |  5 +-
 4 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/ts/nni_manager/training_service/common/containerJobData.ts b/ts/nni_manager/training_service/common/containerJobData.ts
index 05f2178dcd..073859ee83 100644
--- a/ts/nni_manager/training_service/common/containerJobData.ts
+++ b/ts/nni_manager/training_service/common/containerJobData.ts
@@ -12,3 +12,14 @@ else
     # Install nni
     python3 -m pip install --user --upgrade nni
 fi`;
+
+/**
+ * PowerShell counterpart of CONTAINER_INSTALL_NNI_SHELL_FORMAT:
+ * installs nni with pip only when `import nni` fails.
+ */
+export const CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN: string =
+`python -c "import nni" 2>$null
+if ($LASTEXITCODE -ne 0) {
+    python -m pip install --user --upgrade nni
+}
+exit`;
diff --git a/ts/nni_manager/training_service/remote_machine/shellExecutor.ts b/ts/nni_manager/training_service/remote_machine/shellExecutor.ts
index 2cb279f39e..14b9af7fdb 100644
--- a/ts/nni_manager/training_service/remote_machine/shellExecutor.ts
+++ b/ts/nni_manager/training_service/remote_machine/shellExecutor.ts
@@ -30,10 +30,11 @@ class ShellExecutor {
     private readonly sshClient: Client;
     private readonly log: Logger;
     private tempPath: string = "";
-    private isWindows: boolean = false;
     private channelDefaultOutputs: string[] = [];
     private pythonPath: string | undefined;
 
+    public isWindows: boolean = false;
+
     constructor() {
         this.log = getLogger();
         this.sshClient = new Client();
diff --git a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
index fd6b3a8308..e816775ac8 100644
--- a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
+++ b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
@@ -10,7 +10,7 @@ import { getExperimentId } from '../../../common/experimentStartupInfo';
 import { getLogger, Logger } from '../../../common/log';
 import { EnvironmentInformation, EnvironmentService } from '../environment';
 import {
-    getExperimentRootDir,
+    getExperimentRootDir, getLogLevel
 } from '../../../common/utils';
 import { TrialConfig } from '../../common/trialConfig';
 import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
@@ -218,6 +218,41 @@ export class RemoteEnvironmentService extends EnvironmentService {
         this.remoteMachineMetaOccupiedMap.set(remoteEnvironment.rmMachineMeta, false);
     }
 
+    /**
+     * Compose the command line that starts the trial runner of an environment on the remote machine.
+     *
+     * Side effect: sets environment.runnerWorkingFolder to <remoteExperimentRootDir>/envs/<environment.id>.
+     * On a Windows executor, environment.command is replaced by a command that installs nni via
+     * install_nni.ps1 and launches nni.tools.trial_tool.trial_runner; otherwise it is used as given.
+     *
+     * @param environment environment to build the command for
+     * @returns command that changes into the experiment root, runs the runner with redirected output, and writes a status/timestamp record to the `code` file
+     */
+    private async getScript(environment: EnvironmentInformation): Promise<string> {
+        const executor = await this.getExecutor(environment.id);
+        const isDebug = getLogLevel() === "debug";
+        let script: string = environment.command;
+        environment.runnerWorkingFolder = executor.joinPath(this.remoteExperimentRootDir, 'envs', environment.id);
+
+        let codeScript = `echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;
+        if (executor.isWindows) {
+            const prepare = `mkdir envs\\${environment.id} 2>NUL & cd envs\\${environment.id}`;
+            const startrun = `powershell ..\\install_nni.ps1 && python -m nni.tools.trial_tool.trial_runner`;
+            const developingScript = "IF EXIST nni_trial_tool (ECHO \"nni_trial_tool exists already\") ELSE (mkdir nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool) && pip3 install websockets";
+
+            script = isDebug ? `${prepare} && ${developingScript} && ${startrun}` : `${prepare} && ${startrun}`;
+            // Join with the executor: path.join follows the OS of the NNI manager, not of the remote machine.
+            codeScript = `powershell -command "Write $? " " (((New-TimeSpan -Start (Get-Date "01/01/1970") -End (Get-Date).ToUniversalTime()).TotalMilliseconds).ToString("0")) | Out-file ${executor.joinPath(environment.runnerWorkingFolder, 'code')} -Append -NoNewline -encoding utf8"`;
+        }
+
+        script = `cd ${this.remoteExperimentRootDir} && \
+            ${script} --job_pid_file ${environment.runnerWorkingFolder}/pid \
+            1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \
+            && ${codeScript}`;
+
+        return script;
+    }
+
     public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
         if (this.sshConnectionPromises.length > 0) {
             await Promise.all(this.sshConnectionPromises);
@@ -268,11 +303,8 @@ export class RemoteEnvironmentService extends EnvironmentService {
                 } else {
                     this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId());
                 }
-                environment.runnerWorkingFolder = executor.joinPath(this.remoteExperimentRootDir, 'envs', environment.id);
-                environment.command = `cd ${this.remoteExperimentRootDir} && \
-                    ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
-                    1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \
-                    && echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;
+
+                environment.command = await this.getScript(environment);
                 return Promise.resolve(true);
             }
         }
@@ -291,7 +323,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
         // Copy files in codeDir to remote working directory
         await executor.copyDirectoryToRemote(environmentLocalTempFolder, this.remoteExperimentRootDir);
         // Execute command in remote machine, set isInteractive=true to run script in conda environment
-        executor.executeScript(executor.joinPath(environment.runnerWorkingFolder,
+        executor.executeScript(executor.joinPath(this.remoteExperimentRootDir,
             executor.getScriptName("run")), true, true);
         if (environment.rmMachineMeta === undefined) {
             throw new Error(`${environment.id} rmMachineMeta not initialized!`);
diff --git a/ts/nni_manager/training_service/reusable/trialDispatcher.ts b/ts/nni_manager/training_service/reusable/trialDispatcher.ts
index ad32fcf9cb..85cd5f0981 100644
--- a/ts/nni_manager/training_service/reusable/trialDispatcher.ts
+++ b/ts/nni_manager/training_service/reusable/trialDispatcher.ts
@@ -18,6 +18,6 @@ import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, m
 import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands';
 import { ScheduleResultType } from '../../training_service/common/gpuData';
-import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
+import { CONTAINER_INSTALL_NNI_SHELL_FORMAT, CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN } from '../common/containerJobData';
 import { TrialConfig } from '../common/trialConfig';
 import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
 import { validateCodeDir } from '../common/util';
@@ -32,7 +32,6 @@ import { NFSSharedStorageService } from './shared_storages/nfsStorageService'
 import { AzureBlobSharedStorageService } from './shared_storages/azureblobStorageService'
 import { TrialDetail } from './trial';
 
-
 /**
  * It uses to manage jobs on training platforms
  * and expose trial as trial job to upper level.
@@ -225,8 +224,10 @@ class TrialDispatcher implements TrainingService {
         const codeFileName = await storageService.copyDirectory(codeDir, envDir, true);
         storageService.rename(codeFileName, "nni-code.tar.gz");
 
         const installFileName = storageService.joinPath(envDir, 'install_nni.sh');
+        const installFileNameForWin = storageService.joinPath(envDir, 'install_nni.ps1');
         await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName);
+        await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN, installFileNameForWin);
 
         const runnerSettingsConfig = storageService.joinPath(envDir, "settings.json");
         await storageService.save(JSON.stringify(runnerSettings), runnerSettingsConfig);