Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Support paiTrainingService on windows #1075

Merged
merged 114 commits into from
May 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
114 commits
Select commit Hold shift + click to select a range
d77a99c
fix remote bug
Dec 25, 2018
695d866
Merge pull request #106 from Microsoft/master
SparkSnail Dec 25, 2018
b7e9799
Merge pull request #107 from Microsoft/master
SparkSnail Dec 27, 2018
7cb03f9
add document
Dec 27, 2018
44d1565
add document
Dec 27, 2018
7ab7386
update
Dec 27, 2018
d9e1ea8
update
Dec 27, 2018
2c225a8
update
Dec 27, 2018
be23f55
update
Dec 29, 2018
6f760ab
Merge pull request #108 from Microsoft/master
SparkSnail Jan 2, 2019
9161209
fix remote issue
Jan 3, 2019
e661c55
fix forEach
Jan 3, 2019
4e5d836
Merge pull request #109 from Microsoft/master
SparkSnail Jan 3, 2019
f80e737
fix conflict
Jan 4, 2019
aefc219
Merge branch 'Microsoft-master'
Jan 4, 2019
4fec2cc
update doc according to comments
Jan 7, 2019
dc45661
Merge pull request #111 from Microsoft/master
SparkSnail Jan 7, 2019
11fec6f
update
Jan 7, 2019
a03a191
update
Jan 7, 2019
7c7832c
update
Jan 7, 2019
2c862dc
Merge pull request #112 from Microsoft/master
SparkSnail Jan 8, 2019
85c015d
remove 'any more'
Jan 8, 2019
85cb472
Merge branch 'master' of https://github.com/SparkSnail/nni
Jan 8, 2019
3784355
Merge pull request #113 from Microsoft/master
SparkSnail Jan 9, 2019
d91c980
Merge pull request #114 from Microsoft/master
SparkSnail Jan 14, 2019
9786650
Merge pull request #115 from Microsoft/master
SparkSnail Jan 17, 2019
ef176d2
Merge pull request #116 from Microsoft/master
SparkSnail Jan 22, 2019
1089e80
Merge pull request #117 from Microsoft/master
SparkSnail Jan 23, 2019
627e823
Merge pull request #119 from Microsoft/master
SparkSnail Jan 24, 2019
b633c26
Merge pull request #120 from Microsoft/master
SparkSnail Jan 25, 2019
035d58b
Merge pull request #121 from Microsoft/master
SparkSnail Feb 11, 2019
cd549df
Merge pull request #122 from Microsoft/master
SparkSnail Feb 12, 2019
964743a
Merge pull request #123 from Microsoft/master
SparkSnail Feb 12, 2019
8422992
Merge pull request #124 from Microsoft/master
SparkSnail Feb 13, 2019
40391ec
Merge pull request #125 from Microsoft/master
SparkSnail Feb 18, 2019
1d84526
Merge pull request #126 from Microsoft/master
SparkSnail Feb 20, 2019
1852457
Merge pull request #127 from Microsoft/master
SparkSnail Feb 23, 2019
754a354
Merge pull request #128 from Microsoft/master
SparkSnail Feb 24, 2019
1ee9735
Merge pull request #129 from Microsoft/master
SparkSnail Feb 25, 2019
9f4485c
Merge pull request #130 from Microsoft/master
SparkSnail Feb 25, 2019
b1c3774
Merge pull request #131 from Microsoft/master
SparkSnail Feb 25, 2019
5d7923e
Merge pull request #132 from Microsoft/master
SparkSnail Feb 25, 2019
281f3dc
Merge pull request #133 from Microsoft/master
SparkSnail Feb 26, 2019
2ce9157
Merge pull request #134 from Microsoft/master
SparkSnail Feb 26, 2019
571a7af
Merge pull request #135 from Microsoft/master
SparkSnail Feb 28, 2019
f09d51a
Merge pull request #136 from Microsoft/master
SparkSnail Mar 1, 2019
41a9a59
Merge pull request #137 from Microsoft/master
SparkSnail Mar 5, 2019
21165b5
Merge pull request #138 from Microsoft/master
SparkSnail Mar 7, 2019
d25f7b5
Merge pull request #139 from Microsoft/master
SparkSnail Mar 11, 2019
17e719e
Merge pull request #140 from Microsoft/master
SparkSnail Mar 12, 2019
e25ffbd
Merge pull request #141 from Microsoft/master
SparkSnail Mar 13, 2019
5e777d2
Merge pull request #142 from Microsoft/master
SparkSnail Mar 14, 2019
6ff24a5
Merge pull request #143 from Microsoft/master
SparkSnail Mar 18, 2019
ccf6c04
Merge pull request #144 from Microsoft/master
SparkSnail Mar 20, 2019
eb5e21c
Merge pull request #145 from Microsoft/master
SparkSnail Mar 20, 2019
f796c60
Merge pull request #146 from Microsoft/master
SparkSnail Mar 21, 2019
e1ae623
Merge pull request #147 from Microsoft/master
SparkSnail Mar 22, 2019
ec41d56
Merge pull request #148 from Microsoft/master
SparkSnail Mar 25, 2019
080ae00
Merge pull request #149 from Microsoft/master
SparkSnail Mar 26, 2019
f0a2d39
Merge pull request #150 from Microsoft/master
SparkSnail Mar 26, 2019
77526d3
Merge pull request #152 from Microsoft/master
SparkSnail Apr 1, 2019
d95c351
Merge pull request #155 from Microsoft/master
SparkSnail Apr 3, 2019
346d49d
Merge pull request #156 from Microsoft/master
SparkSnail Apr 11, 2019
6af4b86
Merge pull request #158 from Microsoft/master
SparkSnail Apr 12, 2019
cf5336d
Merge pull request #159 from Microsoft/master
SparkSnail Apr 15, 2019
aec4977
Merge pull request #160 from Microsoft/master
SparkSnail Apr 16, 2019
b1dfaff
Merge pull request #161 from Microsoft/master
SparkSnail Apr 18, 2019
6c9360a
Merge pull request #162 from Microsoft/master
SparkSnail Apr 19, 2019
0663218
Merge pull request #163 from Microsoft/master
SparkSnail Apr 22, 2019
5187b2c
Merge pull request #164 from Microsoft/master
SparkSnail Apr 22, 2019
5032694
Merge pull request #165 from Microsoft/master
SparkSnail Apr 23, 2019
c577553
Merge pull request #166 from Microsoft/master
SparkSnail May 5, 2019
93d6502
Merge pull request #167 from Microsoft/master
SparkSnail May 6, 2019
defe000
init
May 9, 2019
ca797f0
debug pipeline
May 9, 2019
bbeef79
debug job name
May 9, 2019
ffd22a2
fix job image
May 9, 2019
c4b0fbe
debug
May 9, 2019
f22ebe9
add job dependency
May 9, 2019
dac10df
debug variable
May 9, 2019
514175a
test variable chagne
May 9, 2019
48c9860
debug variable
May 9, 2019
3ebaa8d
test variable
May 9, 2019
2d50c81
debug
May 9, 2019
9ccb655
test variable
May 9, 2019
9c3a4df
debug
May 9, 2019
c8b17ad
test variable
May 9, 2019
7805185
debug examples
May 9, 2019
07e7fc7
fix pool
May 9, 2019
b254e16
debug
May 9, 2019
ce0cfbc
fix name
May 10, 2019
b5eab4b
Merge pull request #168 from microsoft/master
SparkSnail May 10, 2019
4c65664
test python version
May 13, 2019
da993eb
debug python version
May 13, 2019
28c8bec
debug python
May 13, 2019
3635b82
debug image
May 13, 2019
3ecbb1f
debug variable
May 13, 2019
e21413f
debug variable
May 13, 2019
8c2777d
debug variable
May 13, 2019
78adad1
debug variable
May 13, 2019
3fa4641
debug variable
May 13, 2019
b614554
debug variable
May 13, 2019
3a2bc72
debug variable
May 13, 2019
9466a8a
fix python command in config file
May 14, 2019
9d2c732
add validateCodeDir
May 14, 2019
7269b7c
fix count files
May 14, 2019
f39d69e
Merge pull request #169 from microsoft/master
SparkSnail May 14, 2019
680214d
Merge branch 'master' of https://github.com/SparkSnail/nni into dev-w…
May 14, 2019
5fd7cdb
revert yarn.lock
May 14, 2019
b981ee6
refactor cmd in utils.ts
May 14, 2019
6a7c8e6
fix pathJoin
May 15, 2019
5936493
fix bug in hdfsClinet
May 15, 2019
930b4fa
fix root dir
May 15, 2019
ea687b9
fix comments
May 21, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -357,13 +357,18 @@ function countFilesRecursively(directory: string, timeoutMilliSeconds?: number):
});

let fileCount: number = -1;
cpp.exec(`find ${directory} -type f | wc -l`).then((result) => {
let cmd: string;
if(process.platform === "win32") {
cmd = `powershell "Get-ChildItem -Path ${directory} -Recurse -File | Measure-Object | %{$_.Count}"`
} else {
cmd = `find ${directory} -type f | wc -l`;
}
cpp.exec(cmd).then((result) => {
if(result.stdout && parseInt(result.stdout)) {
fileCount = parseInt(result.stdout);
}
deferred.resolve(fileCount);
});

return Promise.race([deferred.promise, delayTimeout]).finally(() => {
clearTimeout(timeoutId);
});
Expand Down Expand Up @@ -459,6 +464,16 @@ function getNewLine(): string{
}
}

/**
 * Join path segments with '/' regardless of the host platform.
 *
 * The separator is always '/', never '\', so the result can be used as a
 * remote (e.g. HDFS) path even when this code runs on Windows.
 *
 * Segments are concatenated verbatim: slashes already present inside a
 * segment are not collapsed and '.'/'..' are not resolved.
 *
 * @param paths path segments to join; empty-string, undefined and null
 *              segments are skipped
 * @returns the joined path, or '.' when no usable segment was supplied
 */
function unixPathJoin(...paths: any[]): string {
    // Array.prototype.join renders undefined/null as empty strings, which would
    // leave empty components ('a//b') in the result, so drop them up front
    // together with explicit empty strings.
    const dir: string = paths
        .filter((segment: any) => segment !== '' && segment !== undefined && segment !== null)
        .join('/');

    return dir === '' ? '.' : dir;
}

export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine };
11 changes: 10 additions & 1 deletion src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,16 @@ mkDirP(getLogDir())
console.error(`Failed to create log dir: ${err.stack}`);
});

process.on('SIGTERM', async () => {
/**
 * Pick the name of the process signal on which the shutdown handler is
 * registered.
 *
 * @returns 'SIGBREAK' when process.platform is "win32", otherwise 'SIGTERM'
 */
function getStopSignal(): any {
    return process.platform === "win32" ? 'SIGBREAK' : 'SIGTERM';
}

process.on(getStopSignal(), async () => {
const log: Logger = getLogger();
let hasError: boolean = false;
try {
Expand Down
9 changes: 6 additions & 3 deletions src/nni_manager/training_service/pai/hdfsClientUtility.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger } from '../../common/log';
import { unixPathJoin } from '../../common/utils'

/**
* HDFS client utility, including copy file/directory
Expand All @@ -32,15 +33,15 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name
*/
function hdfsExpRootDir(hdfsUserName: string): string {
return path.join('/', hdfsUserName, 'nni', 'experiments', getExperimentId());
return '/' + unixPathJoin(hdfsUserName, 'nni', 'experiments', getExperimentId());
}

/**
* Get NNI experiment code directory
* @param hdfsUserName HDFS user name
*/
export function getHdfsExpCodeDir(hdfsUserName: string): string {
return path.join(hdfsExpRootDir(hdfsUserName), 'codeDir');
return unixPathJoin(hdfsExpRootDir(hdfsUserName), 'codeDir');
}

/**
Expand All @@ -49,7 +50,9 @@ export namespace HDFSClientUtility {
* @param trialId NNI trial ID
*/
export function getHdfsTrialWorkDir(hdfsUserName: string, trialId: string): string {
return path.join(hdfsExpRootDir(hdfsUserName), 'trials', trialId);
let root = hdfsExpRootDir(hdfsUserName)
console.log(root)
return unixPathJoin(root, 'trials', trialId);
}

/**
Expand Down
7 changes: 4 additions & 3 deletions src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ import { delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, getVersion, uniqueString } from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { validateCodeDir } from '../common/util';
import { validateCodeDir, execMkdir } from '../common/util';
import { unixPathJoin } from '../../common/utils'
import { HDFSClientUtility } from './hdfsClientUtility';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { PAI_LOG_PATH_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData';
Expand Down Expand Up @@ -406,12 +407,12 @@ class PAITrainingService implements TrainingService {
}

// Step 1. Prepare PAI job configuration
const hdfsOutputDir : string = path.join(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsOutputDir : string = unixPathJoin(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);

const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
await execMkdir(trialLocalTempFolder);

const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
Expand Down
5 changes: 3 additions & 2 deletions test/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def convert_command():

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote'], default='pai')
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local'], default='pai')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
# args for PAI
Expand All @@ -111,4 +111,5 @@ def convert_command():
args = parser.parse_args()

update_training_service_config(args)
convert_command()
if args.ts == 'local':
convert_command()
2 changes: 1 addition & 1 deletion test/pipelines-it-local-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
displayName: 'Install dependencies for integration tests'
- script: |
cd test
python generate_ts_config.py
python generate_ts_config.py --ts local
displayName: 'generate config files'
- script: |
cd test
Expand Down
65 changes: 65 additions & 0 deletions test/pipelines-it-pai-windows.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Integration-test pipeline for the PAI training service driven from Windows.
# Job 1 prepares the docker image name to test against; job 2 installs NNI
# from source and runs the PAI example tests using that image.
jobs:
# Builds (or selects) the docker image and publishes its name as an output
# variable for the dependent job.
- job: 'build_docker_image'
  # NOTE(review): 0 presumably requests the maximum timeout the agent
  # allows - confirm against the Azure Pipelines documentation.
  timeoutInMinutes: 0
  pool:
    vmImage: 'Ubuntu 16.04'
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'

  # Only runs when the pipeline variable build_docker_img is 'true'.
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdsit_wheel'

  # Either builds and pushes a freshly tagged image or falls back to
  # existing_docker_img; in both cases the chosen name is stored in the
  # job-scoped variable TEST_IMG.
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
        cd deployment/pypi
        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
        echo 'updating docker file for installing nni from local...'
        # update Dockerfile to install NNI in docker image from whl file built in last step
        sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
        cat ../docker/Dockerfile
        export IMG_TAG=`date -u +%y%m%d%H%M`

        echo 'build and upload docker image'
        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
        docker push $(test_docker_img_name):$IMG_TAG

        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
        cd ../../
      else
        export TEST_IMG=$(existing_docker_img)
      fi
      echo "##vso[task.setvariable variable=TEST_IMG]$TEST_IMG"
    displayName: 'build docker image'
  # Re-publishes TEST_IMG as the output variable docker_image. The literal
  # block scalar (|) keeps the two echo commands on separate lines; a plain
  # multi-line scalar would fold them into a single command.
  - script: |
      echo $TEST_IMG
      echo "##vso[task.setvariable variable=docker_image;isOutput=true]$TEST_IMG"
    name: setvariableStep
    displayName: 'set image variable'

# Runs the PAI integration tests with the image produced by the job above.
# NOTE(review): no pool is declared here; the scripts use `set` and
# powershell.exe, so a Windows agent pool is presumably configured
# outside this file - confirm.
- job: 'integration_test_pai'
  timeoutInMinutes: 0
  dependsOn: build_docker_image
  variables:
    # Output variable set by the step named setvariableStep in build_docker_image.
    docker_image: $[ dependencies.build_docker_image.outputs['setvariableStep.docker_image'] ]

  steps:
  - script: |
      set PATH=$(ENV_PATH)
      python --version
      powershell.exe -file install.ps1
    displayName: 'Install nni toolkit via source code'
  - script: |
      cd test
      set PATH=$(ENV_PATH)
      python --version
      python generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) --vc $(pai_virtual_cluster) --nni_docker_image $(docker_image) --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)

      python config_test.py --ts pai --exclude multi_phase,smac,bohb
    displayName: 'Examples and advanced features tests on pai'