Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
add reusable for win remote (#3500)
Browse files Browse the repository at this point in the history
  • Loading branch information
acured authored Apr 7, 2021
1 parent 76f3990 commit 80bc953
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 10 deletions.
7 changes: 7 additions & 0 deletions ts/nni_manager/training_service/common/containerJobData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,10 @@ else
# Install nni
python3 -m pip install --user --upgrade nni
fi`;

/**
 * PowerShell script (saved as `install_nni.ps1`) that installs NNI for the
 * current user only when `import nni` fails under the default `python`.
 *
 * The import probe's stderr is discarded and the decision is made from the
 * probe's process exit code (`$LASTEXITCODE`). `$error` is PowerShell's
 * automatic error collection, so it must not be used as a redirection target
 * or compared against an empty string to detect a failed native command.
 */
export const CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN: string =
`python -c "import nni" 2>$null
if ($LASTEXITCODE -ne 0){
python -m pip install --user --upgrade nni
}
exit`;
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ class ShellExecutor {
private readonly sshClient: Client;
private readonly log: Logger;
private tempPath: string = "";
private isWindows: boolean = false;
private channelDefaultOutputs: string[] = [];
private pythonPath: string | undefined;

public isWindows: boolean = false;

constructor() {
this.log = getLogger();
this.sshClient = new Client();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { getExperimentId } from '../../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../../common/log';
import { EnvironmentInformation, EnvironmentService } from '../environment';
import {
getExperimentRootDir,
getExperimentRootDir, getLogLevel
} from '../../../common/utils';
import { TrialConfig } from '../../common/trialConfig';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
Expand Down Expand Up @@ -218,6 +218,30 @@ export class RemoteEnvironmentService extends EnvironmentService {
this.remoteMachineMetaOccupiedMap.set(remoteEnvironment.rmMachineMeta, false);
}

/**
 * Builds the full command line that launches the trial runner for an
 * environment on its remote machine.
 *
 * Side effect: sets `environment.runnerWorkingFolder` to
 * `<remoteExperimentRootDir>/envs/<environment.id>`, joined by the executor.
 *
 * The returned command changes into the remote experiment root, runs the
 * start command with `--job_pid_file`, redirects stdout/stderr to
 * `trialrunner_stdout` / `trialrunner_stderr` in the working folder, and then
 * (chained with `&&`) writes the exit status and a millisecond timestamp to
 * the `code` file in that folder.
 *
 * @param environment environment whose `id` and `command` are used; its
 *                    `runnerWorkingFolder` is overwritten.
 * @returns the composed shell command string.
 */
private async getScript(environment: EnvironmentInformation): Promise<string> {
    const executor = await this.getExecutor(environment.id);
    const isDebug = getLogLevel() === "debug";

    environment.runnerWorkingFolder = executor.joinPath(this.remoteExperimentRootDir, 'envs', environment.id);
    const workingFolder = environment.runnerWorkingFolder;

    // Defaults: use the command already prepared on the environment.
    let script: string = environment.command;
    let codeScript = `echo $? \`date +%s%3N\` >${workingFolder}/code`;

    if (executor.isWindows) {
        // On Windows the start command is rebuilt from scratch: enter the
        // per-environment folder, make sure NNI is installed, start the runner.
        const prepare = `mkdir envs\\${environment.id} 2>NUL & cd envs\\${environment.id}`;
        const startrun = `powershell ..\\install_nni.ps1 && python -m nni.tools.trial_tool.trial_runner`;
        // Debug log level only: unpack the bundled trial tool if it is not there yet.
        const developingScript = "IF EXIST nni_trial_tool (ECHO \"nni_trial_tool exists already\") ELSE (mkdir nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool) && pip3 install websockets";

        script = isDebug ? `${prepare} && ${developingScript} && ${startrun}` : `${prepare} && ${startrun}`;

        // The path lives on the remote machine, so join it with the executor
        // rather than the local `path` module, whose separator follows the
        // manager host's platform.
        const codeFile = executor.joinPath(workingFolder, 'code');
        codeScript = `powershell -command "Write $? " " (((New-TimeSpan -Start (Get-Date "01/01/1970") -End (Get-Date).ToUniversalTime()).TotalMilliseconds).ToString("0")) | Out-file ${codeFile} -Append -NoNewline -encoding utf8"`;
    }

    return [
        `cd ${this.remoteExperimentRootDir} &&`,
        `${script} --job_pid_file ${workingFolder}/pid`,
        `1>${workingFolder}/trialrunner_stdout 2>${workingFolder}/trialrunner_stderr`,
        `&& ${codeScript}`,
    ].join(' ');
}

public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
if (this.sshConnectionPromises.length > 0) {
await Promise.all(this.sshConnectionPromises);
Expand Down Expand Up @@ -268,11 +292,8 @@ export class RemoteEnvironmentService extends EnvironmentService {
} else {
this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId());
}
environment.runnerWorkingFolder = executor.joinPath(this.remoteExperimentRootDir, 'envs', environment.id);
environment.command = `cd ${this.remoteExperimentRootDir} && \
${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \
&& echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;

environment.command = await this.getScript(environment);
return Promise.resolve(true);
}
}
Expand All @@ -291,7 +312,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
// Copy files in codeDir to remote working directory
await executor.copyDirectoryToRemote(environmentLocalTempFolder, this.remoteExperimentRootDir);
// Execute command in remote machine, set isInteractive=true to run script in conda environment
executor.executeScript(executor.joinPath(environment.runnerWorkingFolder,
executor.executeScript(executor.joinPath(this.remoteExperimentRootDir,
executor.getScriptName("run")), true, true);
if (environment.rmMachineMeta === undefined) {
throw new Error(`${environment.id} rmMachineMeta not initialized!`);
Expand Down
6 changes: 4 additions & 2 deletions ts/nni_manager/training_service/reusable/trialDispatcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, m
import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands';
import { ScheduleResultType } from '../../training_service/common/gpuData';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN } from '../common/containerJobData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { validateCodeDir } from '../common/util';
Expand All @@ -32,7 +33,6 @@ import { NFSSharedStorageService } from './shared_storages/nfsStorageService'
import { AzureBlobSharedStorageService } from './shared_storages/azureblobStorageService'
import { TrialDetail } from './trial';


/**
* It is used to manage jobs on training platforms
* and expose trials as trial jobs to the upper level.
Expand Down Expand Up @@ -225,8 +225,10 @@ class TrialDispatcher implements TrainingService {
const codeFileName = await storageService.copyDirectory(codeDir, envDir, true);
storageService.rename(codeFileName, "nni-code.tar.gz");

const installFileName = storageService.joinPath(envDir, 'install_nni.sh');
const installFileName = storageService.joinPath(envDir, `install_nni.sh`);
const installFileNameForWin = storageService.joinPath(envDir, `install_nni.ps1`);
await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName);
await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN, installFileNameForWin);

const runnerSettingsConfig = storageService.joinPath(envDir, "settings.json");
await storageService.save(JSON.stringify(runnerSettings), runnerSettingsConfig);
Expand Down

0 comments on commit 80bc953

Please sign in to comment.