Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Do not copy codeDir when submitting a trial in Kubeflow and FrameworkController mode #1309

Merged
merged 93 commits into from
Jul 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
93 commits
Select commit Hold shift + click to select a range
d77a99c
fix remote bug
Dec 25, 2018
695d866
Merge pull request #106 from Microsoft/master
SparkSnail Dec 25, 2018
b7e9799
Merge pull request #107 from Microsoft/master
SparkSnail Dec 27, 2018
7cb03f9
add document
Dec 27, 2018
44d1565
add document
Dec 27, 2018
7ab7386
update
Dec 27, 2018
d9e1ea8
update
Dec 27, 2018
2c225a8
update
Dec 27, 2018
be23f55
update
Dec 29, 2018
6f760ab
Merge pull request #108 from Microsoft/master
SparkSnail Jan 2, 2019
9161209
fix remote issue
Jan 3, 2019
e661c55
fix forEach
Jan 3, 2019
4e5d836
Merge pull request #109 from Microsoft/master
SparkSnail Jan 3, 2019
f80e737
fix conflict
Jan 4, 2019
aefc219
Merge branch 'Microsoft-master'
Jan 4, 2019
4fec2cc
update doc according to comments
Jan 7, 2019
dc45661
Merge pull request #111 from Microsoft/master
SparkSnail Jan 7, 2019
11fec6f
update
Jan 7, 2019
a03a191
update
Jan 7, 2019
7c7832c
update
Jan 7, 2019
2c862dc
Merge pull request #112 from Microsoft/master
SparkSnail Jan 8, 2019
85c015d
remove 'any more'
Jan 8, 2019
85cb472
Merge branch 'master' of https://github.com/SparkSnail/nni
Jan 8, 2019
3784355
Merge pull request #113 from Microsoft/master
SparkSnail Jan 9, 2019
d91c980
Merge pull request #114 from Microsoft/master
SparkSnail Jan 14, 2019
9786650
Merge pull request #115 from Microsoft/master
SparkSnail Jan 17, 2019
ef176d2
Merge pull request #116 from Microsoft/master
SparkSnail Jan 22, 2019
1089e80
Merge pull request #117 from Microsoft/master
SparkSnail Jan 23, 2019
627e823
Merge pull request #119 from Microsoft/master
SparkSnail Jan 24, 2019
b633c26
Merge pull request #120 from Microsoft/master
SparkSnail Jan 25, 2019
035d58b
Merge pull request #121 from Microsoft/master
SparkSnail Feb 11, 2019
cd549df
Merge pull request #122 from Microsoft/master
SparkSnail Feb 12, 2019
964743a
Merge pull request #123 from Microsoft/master
SparkSnail Feb 12, 2019
8422992
Merge pull request #124 from Microsoft/master
SparkSnail Feb 13, 2019
40391ec
Merge pull request #125 from Microsoft/master
SparkSnail Feb 18, 2019
1d84526
Merge pull request #126 from Microsoft/master
SparkSnail Feb 20, 2019
1852457
Merge pull request #127 from Microsoft/master
SparkSnail Feb 23, 2019
754a354
Merge pull request #128 from Microsoft/master
SparkSnail Feb 24, 2019
1ee9735
Merge pull request #129 from Microsoft/master
SparkSnail Feb 25, 2019
9f4485c
Merge pull request #130 from Microsoft/master
SparkSnail Feb 25, 2019
b1c3774
Merge pull request #131 from Microsoft/master
SparkSnail Feb 25, 2019
5d7923e
Merge pull request #132 from Microsoft/master
SparkSnail Feb 25, 2019
281f3dc
Merge pull request #133 from Microsoft/master
SparkSnail Feb 26, 2019
2ce9157
Merge pull request #134 from Microsoft/master
SparkSnail Feb 26, 2019
571a7af
Merge pull request #135 from Microsoft/master
SparkSnail Feb 28, 2019
f09d51a
Merge pull request #136 from Microsoft/master
SparkSnail Mar 1, 2019
41a9a59
Merge pull request #137 from Microsoft/master
SparkSnail Mar 5, 2019
21165b5
Merge pull request #138 from Microsoft/master
SparkSnail Mar 7, 2019
d25f7b5
Merge pull request #139 from Microsoft/master
SparkSnail Mar 11, 2019
17e719e
Merge pull request #140 from Microsoft/master
SparkSnail Mar 12, 2019
e25ffbd
Merge pull request #141 from Microsoft/master
SparkSnail Mar 13, 2019
5e777d2
Merge pull request #142 from Microsoft/master
SparkSnail Mar 14, 2019
6ff24a5
Merge pull request #143 from Microsoft/master
SparkSnail Mar 18, 2019
ccf6c04
Merge pull request #144 from Microsoft/master
SparkSnail Mar 20, 2019
eb5e21c
Merge pull request #145 from Microsoft/master
SparkSnail Mar 20, 2019
f796c60
Merge pull request #146 from Microsoft/master
SparkSnail Mar 21, 2019
e1ae623
Merge pull request #147 from Microsoft/master
SparkSnail Mar 22, 2019
ec41d56
Merge pull request #148 from Microsoft/master
SparkSnail Mar 25, 2019
080ae00
Merge pull request #149 from Microsoft/master
SparkSnail Mar 26, 2019
f0a2d39
Merge pull request #150 from Microsoft/master
SparkSnail Mar 26, 2019
77526d3
Merge pull request #152 from Microsoft/master
SparkSnail Apr 1, 2019
d95c351
Merge pull request #155 from Microsoft/master
SparkSnail Apr 3, 2019
346d49d
Merge pull request #156 from Microsoft/master
SparkSnail Apr 11, 2019
6af4b86
Merge pull request #158 from Microsoft/master
SparkSnail Apr 12, 2019
cf5336d
Merge pull request #159 from Microsoft/master
SparkSnail Apr 15, 2019
aec4977
Merge pull request #160 from Microsoft/master
SparkSnail Apr 16, 2019
b1dfaff
Merge pull request #161 from Microsoft/master
SparkSnail Apr 18, 2019
6c9360a
Merge pull request #162 from Microsoft/master
SparkSnail Apr 19, 2019
0663218
Merge pull request #163 from Microsoft/master
SparkSnail Apr 22, 2019
5187b2c
Merge pull request #164 from Microsoft/master
SparkSnail Apr 22, 2019
5032694
Merge pull request #165 from Microsoft/master
SparkSnail Apr 23, 2019
c577553
Merge pull request #166 from Microsoft/master
SparkSnail May 5, 2019
93d6502
Merge pull request #167 from Microsoft/master
SparkSnail May 6, 2019
b5eab4b
Merge pull request #168 from microsoft/master
SparkSnail May 10, 2019
f39d69e
Merge pull request #169 from microsoft/master
SparkSnail May 14, 2019
a030505
Merge pull request #170 from microsoft/master
SparkSnail May 15, 2019
c7ca451
Merge pull request #171 from microsoft/master
SparkSnail May 17, 2019
40bae6e
Merge pull request #172 from microsoft/master
SparkSnail May 26, 2019
c5acd8c
Merge pull request #173 from microsoft/master
SparkSnail May 27, 2019
bee8f84
Merge pull request #174 from microsoft/master
SparkSnail May 28, 2019
e1a4a80
Merge pull request #175 from microsoft/master
SparkSnail May 30, 2019
8a9b2cb
Merge pull request #177 from microsoft/v0.8
SparkSnail Jun 3, 2019
cbf88f7
Merge pull request #181 from microsoft/master
SparkSnail Jun 5, 2019
0235102
Merge pull request #182 from microsoft/master
SparkSnail Jun 10, 2019
9352cc8
Merge pull request #183 from microsoft/master
SparkSnail Jun 14, 2019
d48ad02
Merge pull request #184 from microsoft/master
SparkSnail Jun 20, 2019
f634334
Merge pull request #185 from microsoft/master
SparkSnail Jun 21, 2019
93dd76b
Merge pull request #186 from microsoft/master
SparkSnail Jun 24, 2019
1500458
Merge pull request #187 from microsoft/master
SparkSnail Jun 24, 2019
80ccfcc
upload code dir separately
SparkSnail Jul 16, 2019
c3aec18
fix frameworkcontroller
SparkSnail Jul 16, 2019
9915446
remove unused code
SparkSnail Jul 16, 2019
ffed089
fix comments
SparkSnail Jul 22, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -201,19 +201,26 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('Kubeflow Cluster config is not initialized');
}

if (this.fcTrialConfig === undefined) {
throw new Error('Kubeflow trial config is not initialized');
}

let trialJobOutputUrl: string = '';

if (this.fcClusterConfig.storageType === 'azureStorage') {
if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized');
}
try {
//upload local files to azure storage
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${this.fcTrialConfig.codeDir}`);

trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/\
${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/` +
`${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error);

Expand All @@ -226,7 +233,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);

// Copy codeDir to NFS mounted dir
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
}
Expand Down Expand Up @@ -257,13 +265,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('frameworkcontroller trial config is not initialized');
}

await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

for (const taskRole of this.fcTrialConfig.taskRoles) {
const runScriptContent: string =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('Kubeflow Cluster config is not initialized');
}

if (this.kubeflowTrialConfig === undefined) {
throw new Error('Kubeflow Trial config is not initialized');
}

let trialJobOutputUrl: string = '';

assert(this.kubeflowClusterConfig.storage === undefined
Expand All @@ -212,13 +216,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('azureStorageClient is not initialized');
}
try {
//upload local files to azure storage
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${this.kubeflowTrialConfig.codeDir}`);

trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}\
/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
`/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error);

Expand All @@ -228,9 +236,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Create work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
// Copy script files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);

// Copy codeDir to NFS mounted dir
await cpp.exec(`cp -r ${this.kubeflowTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
}
Expand All @@ -255,13 +264,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}

//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

// Write worker file content run_worker.sh to local tmp folders
if (kubeflowTrialConfig.worker !== undefined) {
Expand Down