Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Refactor code storage logic for trial #2403

Merged
merged 62 commits into from
May 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
704b50e
Merge pull request #200 from microsoft/master
SparkSnail Aug 6, 2019
5b0034e
Merge pull request #204 from microsoft/master
SparkSnail Aug 20, 2019
8fe2588
Merge pull request #205 from microsoft/master
SparkSnail Aug 30, 2019
9fae194
Merge pull request #206 from microsoft/master
SparkSnail Sep 16, 2019
c785655
Merge pull request #207 from microsoft/master
SparkSnail Oct 21, 2019
2f5272c
Merge pull request #208 from microsoft/master
SparkSnail Oct 24, 2019
1892bc2
Merge pull request #209 from microsoft/master
SparkSnail Oct 28, 2019
7c1ab11
Merge pull request #210 from microsoft/master
SparkSnail Oct 28, 2019
8c203f3
Merge pull request #211 from microsoft/master
SparkSnail Oct 31, 2019
d7a62f6
check pylint for nni_cmd
SparkSnail Oct 31, 2019
e259d10
fix id error
SparkSnail Oct 31, 2019
4997295
Merge pull request #212 from microsoft/master
SparkSnail Nov 3, 2019
c037a7c
Merge pull request #213 from microsoft/master
SparkSnail Nov 10, 2019
7620e7c
Merge pull request #214 from microsoft/master
SparkSnail Nov 14, 2019
d16dbe9
Merge pull request #215 from microsoft/master
SparkSnail Nov 19, 2019
9ce751d
Merge pull request #216 from microsoft/master
SparkSnail Nov 21, 2019
a0846f2
Merge pull request #217 from microsoft/master
SparkSnail Nov 22, 2019
cd3a912
Merge pull request #218 from microsoft/master
SparkSnail Nov 27, 2019
32efaa3
Merge pull request #219 from microsoft/master
SparkSnail Dec 10, 2019
543239c
Merge pull request #220 from microsoft/master
SparkSnail Dec 12, 2019
36e6e35
Merge pull request #221 from microsoft/master
SparkSnail Dec 19, 2019
f9ee589
Merge pull request #222 from microsoft/master
SparkSnail Dec 24, 2019
b9a7a95
Merge pull request #223 from microsoft/master
SparkSnail Dec 25, 2019
1a5c017
Merge pull request #224 from microsoft/master
SparkSnail Jan 6, 2020
392460a
Merge pull request #225 from microsoft/master
SparkSnail Jan 8, 2020
9bafa4c
Merge pull request #226 from microsoft/master
SparkSnail Jan 8, 2020
c23b807
Merge pull request #227 from microsoft/master
SparkSnail Jan 10, 2020
4132f62
Merge pull request #228 from microsoft/master
SparkSnail Jan 10, 2020
4f66d0c
Merge pull request #229 from microsoft/master
SparkSnail Feb 1, 2020
129c4a5
Merge pull request #230 from microsoft/master
SparkSnail Feb 4, 2020
3fe117f
Merge pull request #231 from microsoft/master
SparkSnail Feb 7, 2020
aa31674
Merge pull request #233 from microsoft/master
SparkSnail Feb 21, 2020
1d74ae5
Merge pull request #234 from microsoft/master
SparkSnail Feb 27, 2020
75028bd
Merge pull request #235 from microsoft/master
SparkSnail Mar 17, 2020
4773c91
Merge pull request #236 from microsoft/master
SparkSnail Mar 18, 2020
3ee0961
Merge pull request #237 from microsoft/master
SparkSnail Mar 20, 2020
0fb7862
Merge pull request #238 from microsoft/master
SparkSnail Mar 26, 2020
6c3148c
Merge pull request #239 from microsoft/master
SparkSnail Apr 3, 2020
b4773e1
Merge pull request #240 from microsoft/master
SparkSnail Apr 11, 2020
6728799
Merge pull request #241 from microsoft/master
SparkSnail Apr 16, 2020
1b9daa3
Merge pull request #242 from microsoft/master
SparkSnail Apr 20, 2020
e0c2c0e
Merge pull request #243 from microsoft/master
SparkSnail Apr 23, 2020
e29b58a
Merge pull request #244 from microsoft/master
SparkSnail Apr 30, 2020
22b25c5
init
SparkSnail May 3, 2020
22d4780
Merge branch 'master' of https://github.com/SparkSnail/nni into dev-s…
SparkSnail May 5, 2020
83d6d1c
support kubeflow and frameworkcontroller
SparkSnail May 5, 2020
3dfc4bc
format annotation
SparkSnail May 5, 2020
88cb348
fix eslint
SparkSnail May 6, 2020
1e51182
Merge pull request #245 from microsoft/master
SparkSnail May 8, 2020
d5ccfda
add doc
SparkSnail May 8, 2020
a3d7f35
fix logic according to discussion
SparkSnail May 12, 2020
d90433d
Merge pull request #246 from microsoft/master
SparkSnail May 12, 2020
1455e4e
fix conflict
SparkSnail May 12, 2020
57b7767
fix comments
SparkSnail May 12, 2020
c3c721f
fix eslint
SparkSnail May 13, 2020
e786ff8
print nniManager.log when trial fails
SparkSnail May 13, 2020
fe6e848
fix comments
SparkSnail May 15, 2020
62de928
fix windows scripts
SparkSnail May 15, 2020
6568eae
Merge pull request #247 from microsoft/master
SparkSnail May 18, 2020
58922c7
fix comments
SparkSnail May 19, 2020
0fd38de
Merge pull request #248 from microsoft/master
SparkSnail May 19, 2020
0c1e5d0
Merge branch 'master' of https://github.com/SparkSnail/nni into dev-s…
SparkSnail May 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

'use strict';

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
Expand Down Expand Up @@ -72,6 +73,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this.kubernetesRestServerPort = restServer.clusterRestServerPort;
}

// wait upload of code Dir to finish
if (this.copyExpCodeDirPromise !== undefined) {
await this.copyExpCodeDirPromise;
}

const trialJobId: string = uniqueString(5);
// Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
Expand All @@ -81,8 +87,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this.generateContainerPort();
await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form);

//upload code files
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
//wait upload of script files to finish
const trialJobOutputUrl: string = await this.uploadFolder(trialLocalTempFolder, `nni/${getExperimentId()}/${trialJobId}`);
let initStatus: TrialJobStatus = 'WAITING';
if (!trialJobOutputUrl) {
initStatus = 'FAILED';
Expand Down Expand Up @@ -151,6 +157,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.fcTrialConfig.codeDir);
//upload codeDir to storage
this.copyExpCodeDirPromise = this.uploadFolder(this.fcTrialConfig.codeDir, `nni/${getExperimentId()}/nni-code`);
} catch (error) {
this.log.error(error);

Expand All @@ -171,41 +179,31 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}

/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
* upload local folder to nfs or azureStroage
*/
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
private async uploadFolder(srcDirectory: string, destDirectory: string): Promise<string> {
if (this.fcClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized');
}

if (this.fcTrialConfig === undefined) {
throw new Error('Kubeflow trial config is not initialized');
}

let trialJobOutputUrl: string = '';
assert(this.fcClusterConfig.storage === undefined
|| this.fcClusterConfig.storage === 'azureStorage'
|| this.fcClusterConfig.storage === 'nfs');

if (this.fcClusterConfig.storageType === 'azureStorage') {
const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure =
<FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir,
azureFrameworkControllerClusterConfig.uploadRetryCount);
} else if (this.fcClusterConfig.storageType === 'nfs') {
const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
<FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
// Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
// Copy codeDir to NFS mounted dir
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
if (this.fcClusterConfig.storage === 'azureStorage') {
if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized');
}
const fcClusterConfigAzure: FrameworkControllerClusterConfigAzure = <FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, fcClusterConfigAzure.uploadRetryCount);
} else if (this.fcClusterConfig.storage === 'nfs' || this.fcClusterConfig.storage === undefined) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or the storage is undefined? This should be an error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In original design, the storage field is optional, if it is not set, the default value is nfs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but when storage is undefined, there is no config for storage, how do you mount it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is supposed to be check in NNICTL side, if users didn't set storage field, nnictl should check if usres set nfs configuration.

await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalNFSTempFolder}/${destDirectory}/.`);
const fcClusterConfigNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
const nfsConfig: NFSConfig = fcClusterConfigNFS.nfs;
return `nfs://${nfsConfig.server}:${destDirectory}`;
}

return Promise.resolve(trialJobOutputUrl);
return '';
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,20 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
this.kubernetesRestServerPort = restServer.clusterRestServerPort;
}

// upload code Dir to storage
if (this.copyExpCodeDirPromise !== undefined) {
await this.copyExpCodeDirPromise;
}

const trialJobId: string = uniqueString(5);
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//prepare the runscript
await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form);
//upload files to sotrage
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
//upload script files to sotrage
const trialJobOutputUrl: string = await this.uploadFolder(trialLocalTempFolder, `nni/${getExperimentId()}/${trialJobId}`);
let initStatus: TrialJobStatus = 'WAITING';
if (!trialJobOutputUrl) {
initStatus = 'FAILED';
Expand Down Expand Up @@ -152,6 +158,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.kubeflowTrialConfig.codeDir);
//upload codeDir to storage
this.copyExpCodeDirPromise = this.uploadFolder(this.kubeflowTrialConfig.codeDir, `nni/${getExperimentId()}/nni-code`);
} catch (error) {
this.log.error(error);

Expand All @@ -172,12 +180,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}

/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
* upload local folder to nfs or azureStroage
*/
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
private async uploadFolder(srcDirectory: string, destDirectory: string): Promise<string> {
if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized');
}
Expand All @@ -186,8 +191,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('Kubeflow Trial config is not initialized');
}

let trialJobOutputUrl: string = '';

assert(this.kubeflowClusterConfig.storage === undefined
|| this.kubeflowClusterConfig.storage === 'azureStorage'
|| this.kubeflowClusterConfig.storage === 'nfs');
Expand All @@ -197,20 +200,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('azureStorageClient is not initialized');
}
const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir, azureKubeflowClusterConfig.uploadRetryCount);
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, azureKubeflowClusterConfig.uploadRetryCount);
} else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalNFSTempFolder}/${destDirectory}/.`);
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy script files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
// Copy codeDir to NFS mounted dir
await cpp.exec(`cp -r ${this.kubeflowTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
return `nfs://${nfsConfig.server}:${destDirectory}`;
}

return Promise.resolve(trialJobOutputUrl);
return '';
}

private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string,
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/training_service/kubernetes/kubernetesData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
export const kubernetesScriptFormat: string =
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
export NNI_SYS_DIR={1}
export NNI_OUTPUT_DIR={2}
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={3}
Expand All @@ -49,7 +49,7 @@ export NNI_TRIAL_SEQ_ID={6}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ abstract class KubernetesTrainingService {
protected kubernetesClusterConfig?: KubernetesClusterConfig;
protected versionCheck: boolean = true;
protected logCollection: string;
protected copyExpCodeDirPromise?: Promise<string>;
protected expContainerCodeFolder: string;

constructor() {
this.log = getLogger();
Expand All @@ -57,6 +59,7 @@ abstract class KubernetesTrainingService {
this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp');
this.experimentId = getExperimentId();
this.CONTAINER_MOUNT_PATH = '/tmp/mount';
this.expContainerCodeFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId, 'nni-code');
this.genericK8sClient = new GeneralK8sClient();
this.logCollection = 'none';
}
Expand Down Expand Up @@ -272,11 +275,11 @@ abstract class KubernetesTrainingService {
const runScript: string = String.Format(
kubernetesScriptFormat,
platform,
trialJobId,
trialWorkingFolder,
path.join(trialWorkingFolder, 'output', `${roleName}_output`),
trialJobId,
getExperimentId(),
trialWorkingFolder,
this.expContainerCodeFolder,
trialSequenceId,
nvidiaScript,
command,
Expand Down Expand Up @@ -329,51 +332,45 @@ abstract class KubernetesTrainingService {
);
return registrySecretName;
}

protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: string, codeDir: string, uploadRetryCount: number | undefined): Promise<string> {

/**
* upload local directory to azureStorage
* @param srcDirectory the source directory of local folder
* @param destDirectory the target directory in azure
* @param uploadRetryCount the retry time when upload failed
*/
protected async uploadFolderToAzureStorage(srcDirectory: string, destDirectory: string, uploadRetryCount: number | undefined): Promise<string> {
if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized');
}
let trialJobOutputUrl: string = '';
let retryCount: number = 1;
if(uploadRetryCount) {
retryCount = uploadRetryCount;
}
let resultUploadNNIScript: boolean = false;
let resultUploadCodeFile: boolean = false;
let uploadSuccess: boolean = false;
let folderUriInAzure = '';
try {
do {
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
if(!resultUploadNNIScript) {
resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${trialLocalTempFolder}`);
}
//upload code files to azure storage
if(!resultUploadCodeFile) {
resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${codeDir}`);
}
if (resultUploadNNIScript && resultUploadCodeFile) {
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
`/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
break;
} else {
uploadSuccess = await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient,
`${destDirectory}`,
this.azureStorageShare,
`${srcDirectory}`);
if (!uploadSuccess) {
//wait for 5 seconds to re-upload files
await delay(5000);
this.log.info('Upload failed, Retry: upload files to azure-storage');
} else {
folderUriInAzure = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${destDirectory}`;
break;
}
} while (retryCount-- >= 0)
} catch (error) {
this.log.error(error);
//return a empty url when got error
return Promise.resolve("");
}
if(!trialJobOutputUrl) {
this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`);
return Promise.resolve('');
}
return Promise.resolve(trialJobOutputUrl);
return Promise.resolve(folderUriInAzure);
}

}
Expand Down
39 changes: 23 additions & 16 deletions src/nni_manager/training_service/local/localTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -361,21 +361,25 @@ class LocalTrainingService implements TrainingService {
trialJobDetail: TrialJobDetail,
resource: { gpuIndices: number[] },
gpuNum: number | undefined): { key: string; value: string }[] {
const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_EXP_ID', value: this.experimentId },
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
{ key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
];
if (gpuNum !== undefined) {
envVariables.push({
key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
});
}
if (this.localTrialConfig === undefined) {
throw new Error('localTrialConfig is not initialized!');
}
const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_EXP_ID', value: this.experimentId },
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
{ key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() },
{ key: 'NNI_CODE_DIR', value: this.localTrialConfig.codeDir}
];
if (gpuNum !== undefined) {
envVariables.push({
key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
});
}

return envVariables;
}
Expand Down Expand Up @@ -473,12 +477,16 @@ class LocalTrainingService implements TrainingService {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = [];
if (process.platform === 'win32') {
script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`);
script.push(`cd $env:NNI_SYS_DIR`);
script.push(
`cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`);
} else {
script.push(`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`);
script.push(`cd $NNI_SYS_DIR`);
script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`);
if (process.platform === 'darwin') {
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
Expand Down Expand Up @@ -506,7 +514,6 @@ class LocalTrainingService implements TrainingService {
if (process.platform !== 'win32') {
runScriptContent.push('#!/bin/bash');
}
runScriptContent.push(`cd '${this.localTrialConfig.codeDir}'`);
for (const variable of variables) {
runScriptContent.push(setEnvironmentVariable(variable));
}
Expand Down
Loading