From 1cd5c78250702fca1fd87e96db83a64b69038788 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Sun, 23 Feb 2020 22:07:04 +0800 Subject: [PATCH 01/34] add pipeline for paiK8s --- test/config_test.py | 2 +- test/generate_ts_config.py | 19 ++++++++++- test/pipelines-it-pai.yml | 4 +-- test/pipelines-it-paiYarn.yml | 59 +++++++++++++++++++++++++++++++++++ test/training_service.yml | 18 ++++++++++- 5 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 test/pipelines-it-paiYarn.yml diff --git a/test/config_test.py b/test/config_test.py index 91136a8a95..bce0778d1a 100644 --- a/test/config_test.py +++ b/test/config_test.py @@ -112,7 +112,7 @@ def run(args): parser = argparse.ArgumentParser() parser.add_argument("--config", type=str, default=None) parser.add_argument("--exclude", type=str, default=None) - parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'], default='local') + parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'paiYarn', 'kubeflow', 'frameworkcontroller'], default='local') parser.add_argument("--local_gpu", action='store_true') parser.add_argument("--preinstall", action='store_true') args = parser.parse_args() diff --git a/test/generate_ts_config.py b/test/generate_ts_config.py index fb5784d3b1..ff3bd5cfc7 100644 --- a/test/generate_ts_config.py +++ b/test/generate_ts_config.py @@ -12,7 +12,7 @@ def update_training_service_config(args): config = get_yml_content(TRAINING_SERVICE_FILE) if args.nni_manager_ip is not None: config[args.ts]['nniManagerIp'] = args.nni_manager_ip - if args.ts == 'pai': + if args.ts == 'paiYarn': if args.pai_user is not None: config[args.ts]['paiYarnConfig']['userName'] = args.pai_user if args.pai_pwd is not None: @@ -27,6 +27,23 @@ def update_training_service_config(args): config[args.ts]['trial']['outputDir'] = args.output_dir if args.vc is not None: config[args.ts]['trial']['virtualCluster'] = args.vc + if args.ts == 'pai': + if args.pai_user is not None: + config[args.ts]['paiConfig']['userName'] = args.pai_user + if args.pai_host is not None: + config[args.ts]['paiConfig']['host'] = args.pai_host + if args.pai_token is not None: + config[args.ts]['paiConfig']['token'] = args.pai_token + if args.nni_docker_image is not None: + config[args.ts]['trial']['image'] = args.nni_docker_image + if args.nniManagerNFSMountPath is not None: + config[args.ts]['trial']['nniManagerNFSMountPath'] = args.nniManagerNFSMountPath + if args.containerNFSMountPath is not None: + config[args.ts]['trial']['containerNFSMountPath'] = args.containerNFSMountPath + if args.paiStoragePlugin is not None: + config[args.ts]['trial']['paiStoragePlugin'] = args.paiStoragePlugin + if args.vc is not None: + config[args.ts]['trial']['virtualCluster'] = args.vc elif args.ts == 'kubeflow': if args.nfs_server is not None: config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server diff --git a/test/pipelines-it-pai.yml b/test/pipelines-it-pai.yml index 15f24b591d..c5ed99f184 100644 --- a/test/pipelines-it-pai.yml +++ b/test/pipelines-it-pai.yml @@ -51,8 +51,8 @@ jobs: echo "TEST_IMG:$TEST_IMG" cd test - python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) --vc $(pai_virtual_cluster) \ - --nni_docker_image $TEST_IMG --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip) + python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --paiStoragePlugin $(paiStoragePlugin)\ + --pai_token $(pai_token) --nniManagerNFSMountPath $(nniManagerNFSMountPath) --containerNFSMountPath $(containerNFSMountPath) --nni_manager_ip $(nni_manager_ip) PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai PATH=$HOME/.local/bin:$PATH python3 metrics_test.py diff --git a/test/pipelines-it-paiYarn.yml b/test/pipelines-it-paiYarn.yml new file mode 100644 index 0000000000..ad5ec3b305 --- /dev/null +++ b/test/pipelines-it-paiYarn.yml @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +jobs: +- job: 'integration_test_paiYarn' + timeoutInMinutes: 0 + + steps: + - script: python3 -m pip install --upgrade pip setuptools --user + displayName: 'Install python tools' + + - script: | + cd deployment/pypi + echo 'building prerelease package...' + make build + ls $(Build.SourcesDirectory)/deployment/pypi/dist/ + condition: eq( variables['build_docker_img'], 'true' ) + displayName: 'build nni bdsit_wheel' + + - script: | + source install.sh + displayName: 'Install nni toolkit via source code' + + - script: | + sudo apt-get install swig -y + PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC + PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB + displayName: 'Install dependencies for integration tests in PAI mode' + + - script: | + set -e + if [ $(build_docker_img) = 'true' ] + then + cd deployment/pypi + docker login -u $(docker_hub_user) -p $(docker_hub_pwd) + echo 'updating docker file for installing nni from local...' + # update Dockerfile to install NNI in docker image from whl file built in last step + sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile + cat ../docker/Dockerfile + export IMG_TAG=`date -u +%y%m%d%H%M` + + echo 'build and upload docker image' + docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG . + docker push $(test_docker_img_name):$IMG_TAG + + export TEST_IMG=$(test_docker_img_name):$IMG_TAG + cd ../../ + else + export TEST_IMG=$(existing_docker_img) + fi + + echo "TEST_IMG:$TEST_IMG" + cd test + python3 generate_ts_config.py --ts paiYarn --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) --vc $(pai_virtual_cluster) \ + --nni_docker_image $TEST_IMG --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip) + + PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts paiYarn + PATH=$HOME/.local/bin:$PATH python3 metrics_test.py + displayName: 'integration test' diff --git a/test/training_service.yml b/test/training_service.yml index 2a00acca54..0c5be4df8e 100644 --- a/test/training_service.yml +++ b/test/training_service.yml @@ -52,7 +52,7 @@ frameworkcontroller: local: trainingServicePlatform: local -pai: +paiYarn: nniManagerIp: maxExecDuration: 15m paiYarnConfig: @@ -68,6 +68,22 @@ pai: memoryMB: 8192 outputDir: virtualCluster: +pai: + nniManagerIp: + maxExecDuration: 15m + paiConfig: + host: + host: + userName: + trainingServicePlatform: pai + trial: + gpuNum: 1 + cpuNum: 1 + image: + memoryMB: 8192 + nniManagerNFSMountPath: + containerNFSMountPath: + paiStoragePlugin: remote: machineList: - ip: From 2c4f8c50cf0e1484ba370ab67d5b1d42efdf3b52 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Sun, 23 Feb 2020 22:38:56 +0800 Subject: [PATCH 02/34] fix pipeline --- test/generate_ts_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/generate_ts_config.py b/test/generate_ts_config.py index ff3bd5cfc7..2824ac53ce 100644 --- a/test/generate_ts_config.py +++ b/test/generate_ts_config.py @@ -111,6 +111,10 @@ def convert_command(): parser.add_argument("--data_dir", type=str) parser.add_argument("--output_dir", type=str) parser.add_argument("--vc", type=str) + parser.add_argument("--pai_token", type=str) + parser.add_argument("--paiStoragePlugin", type=str) + parser.add_argument("--nniManagerNFSMountPath", type=str) + parser.add_argument("--containerNFSMountPath", type=str) # args for kubeflow and frameworkController parser.add_argument("--nfs_server", type=str) parser.add_argument("--nfs_path", type=str) From 0881058226bd0d6e6f8a056d666665dbda749ae5 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Sun, 23 Feb 2020 23:05:37 +0800 Subject: [PATCH 03/34] fix token --- test/training_service.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/test/training_service.yml b/test/training_service.yml index 0c5be4df8e..040342da67 100644 --- a/test/training_service.yml +++ b/test/training_service.yml @@ -72,7 +72,6 @@ pai: nniManagerIp: maxExecDuration: 15m paiConfig: - host: host: userName: trainingServicePlatform: pai From dad062f49f031d6ccca135cd5f057e0770e022f9 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Tue, 25 Feb 2020 22:20:36 +0800 Subject: [PATCH 04/34] exclude multiphase --- test/pipelines-it-pai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pipelines-it-pai.yml b/test/pipelines-it-pai.yml index c5ed99f184..70c276ce7e 100644 --- a/test/pipelines-it-pai.yml +++ b/test/pipelines-it-pai.yml @@ -54,6 +54,6 @@ jobs: python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --paiStoragePlugin $(paiStoragePlugin)\ --pai_token $(pai_token) --nniManagerNFSMountPath $(nniManagerNFSMountPath) --containerNFSMountPath $(containerNFSMountPath) --nni_manager_ip $(nni_manager_ip) - PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai + PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai --exclude multi_phase PATH=$HOME/.local/bin:$PATH python3 metrics_test.py displayName: 'integration test' From e12bf395018d013f8e1627bef317d9cff5c64b64 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 27 Feb 2020 17:58:08 +0800 Subject: [PATCH 05/34] fix comments --- test/generate_ts_config.py | 12 ++++++------ test/pipelines-it-pai.yml | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/generate_ts_config.py b/test/generate_ts_config.py index 2824ac53ce..efa24ee4e2 100644 --- a/test/generate_ts_config.py +++ b/test/generate_ts_config.py @@ -37,11 +37,11 @@ def update_training_service_config(args): if args.nni_docker_image is not None: config[args.ts]['trial']['image'] = args.nni_docker_image if args.nniManagerNFSMountPath is not None: - config[args.ts]['trial']['nniManagerNFSMountPath'] = args.nniManagerNFSMountPath + config[args.ts]['trial']['nniManagerNFSMountPath'] = args.nni_manager_nfs_mount_path if args.containerNFSMountPath is not None: - config[args.ts]['trial']['containerNFSMountPath'] = args.containerNFSMountPath + config[args.ts]['trial']['containerNFSMountPath'] = args.container_nfs_mount_path if args.paiStoragePlugin is not None: - config[args.ts]['trial']['paiStoragePlugin'] = args.paiStoragePlugin + config[args.ts]['trial']['paiStoragePlugin'] = args.pai_storage_plugin if args.vc is not None: config[args.ts]['trial']['virtualCluster'] = args.vc elif args.ts == 'kubeflow': @@ -112,9 +112,9 @@ def convert_command(): parser.add_argument("--output_dir", type=str) parser.add_argument("--vc", type=str) parser.add_argument("--pai_token", type=str) - parser.add_argument("--paiStoragePlugin", type=str) - parser.add_argument("--nniManagerNFSMountPath", type=str) - parser.add_argument("--containerNFSMountPath", type=str) + parser.add_argument("--pai_storage_plugin", type=str) + parser.add_argument("--nni_manager_nfs_mount_path", type=str) + parser.add_argument("--container_nfs_mount_path", type=str) # args for kubeflow and frameworkController parser.add_argument("--nfs_server", type=str) parser.add_argument("--nfs_path", type=str) diff --git a/test/pipelines-it-pai.yml b/test/pipelines-it-pai.yml index 70c276ce7e..d0d04afc07 100644 --- a/test/pipelines-it-pai.yml +++ b/test/pipelines-it-pai.yml @@ -51,8 +51,8 @@ jobs: echo "TEST_IMG:$TEST_IMG" cd test - python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --paiStoragePlugin $(paiStoragePlugin)\ - --pai_token $(pai_token) --nniManagerNFSMountPath $(nniManagerNFSMountPath) --containerNFSMountPath $(containerNFSMountPath) --nni_manager_ip $(nni_manager_ip) + python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin)\ + --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai --exclude multi_phase PATH=$HOME/.local/bin:$PATH python3 metrics_test.py From 5dee410d8da42235907b253d16b577f493635bb4 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Mar 2020 14:52:11 +0800 Subject: [PATCH 06/34] fix pipeline --- test/generate_ts_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/generate_ts_config.py b/test/generate_ts_config.py index efa24ee4e2..dbebdc45b6 100644 --- a/test/generate_ts_config.py +++ b/test/generate_ts_config.py @@ -36,11 +36,11 @@ def update_training_service_config(args): config[args.ts]['paiConfig']['token'] = args.pai_token if args.nni_docker_image is not None: config[args.ts]['trial']['image'] = args.nni_docker_image - if args.nniManagerNFSMountPath is not None: + if args.nni_manager_nfs_mount_path is not None: config[args.ts]['trial']['nniManagerNFSMountPath'] = args.nni_manager_nfs_mount_path - if args.containerNFSMountPath is not None: + if args.container_nfs_mount_path is not None: config[args.ts]['trial']['containerNFSMountPath'] = args.container_nfs_mount_path - if args.paiStoragePlugin is not None: + if args.pai_storage_plugin is not None: config[args.ts]['trial']['paiStoragePlugin'] = args.pai_storage_plugin if args.vc is not None: config[args.ts]['trial']['virtualCluster'] = args.vc From f855f018cc62f44a505dd6af42c29d50ea7d4a50 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 31 Mar 2020 19:04:57 +0800 Subject: [PATCH 07/34] fix pai-windows --- test/pipelines/pipelines-it-pai-windows.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index f1fc6d1f05..8e412239fb 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -63,6 +63,7 @@ jobs: cd test set PATH=$(ENV_PATH) python --version - python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) --vc $(pai_virtual_cluster) --nni_docker_image $(docker_image) --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip) + python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin)\ + --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From 79641246a2c0d46d19a2dd806d49e2f4b322bc5b Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 31 Mar 2020 22:59:35 +0800 Subject: [PATCH 08/34] debug --- test/pipelines/pipelines-it-pai-windows.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index 8e412239fb..e0680d5c93 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -63,6 +63,9 @@ jobs: cd test set PATH=$(ENV_PATH) python --version + echo $(container_nfs_mount_path) + echo $(pai_token) + echo $(nni_manager_ip) python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin)\ --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase From 4b764999f167a6270a6c5e273fac9df7069262e0 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Wed, 1 Apr 2020 00:02:57 +0800 Subject: [PATCH 09/34] fix error --- test/pipelines/pipelines-it-pai-windows.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index e0680d5c93..aa752edf8f 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -63,10 +63,6 @@ jobs: cd test set PATH=$(ENV_PATH) python --version - echo $(container_nfs_mount_path) - echo $(pai_token) - echo $(nni_manager_ip) - python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin)\ - --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) + python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin) --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From a2678853ab525f3354bc5b8f6fcbca2772597456 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Apr 2020 10:24:23 +0800 Subject: [PATCH 10/34] debug --- test/pipelines/pipelines-it-pai-windows.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index aa752edf8f..d8c00ef471 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -63,6 +63,7 @@ jobs: cd test set PATH=$(ENV_PATH) python --version - python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin) --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) + echo $(pai_token) + python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(existing_docker_img) --pai_storage_plugin $(pai_storage_plugin) --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From 7b4d59d8b8d94800191dc607b0d97aa25a44c2a8 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 2 Apr 2020 23:07:19 +0800 Subject: [PATCH 11/34] debug --- test/pipelines/pipelines-it-pai-windows.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index d8c00ef471..c2a743b5ed 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -64,6 +64,7 @@ jobs: set PATH=$(ENV_PATH) python --version echo $(pai_token) - python nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(existing_docker_img) --pai_storage_plugin $(pai_storage_plugin) --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) + echo $($(nni_manager_ip) + python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(existing_docker_img) --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From b535918d011e97c91d28da8d84d86b4ea4b723fd Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Apr 2020 01:44:07 +0800 Subject: [PATCH 12/34] mount folder --- test/pipelines/pipelines-it-pai-windows.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index c2a743b5ed..b84fb26a70 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -65,6 +65,7 @@ jobs: python --version echo $(pai_token) echo $($(nni_manager_ip) + mount -o anon \\10.151.41.14\data\share\drbdha\data Z: python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(existing_docker_img) --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From 736aa86fe5d644667cd74ca1fe9f788eab1b4e6b Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Apr 2020 02:03:14 +0800 Subject: [PATCH 13/34] fix execopy --- src/nni_manager/training_service/common/util.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/common/util.ts b/src/nni_manager/training_service/common/util.ts index 0d578ac7e7..9328f01e61 100644 --- a/src/nni_manager/training_service/common/util.ts +++ b/src/nni_manager/training_service/common/util.ts @@ -69,7 +69,7 @@ export async function execMkdir(directory: string, share: boolean = false): Prom */ export async function execCopydir(source: string, destination: string): Promise { if (process.platform === 'win32') { - await cpp.exec(`powershell.exe Copy-Item "${source}" -Destination "${destination}" -Recurse`); + await cpp.exec(`powershell.exe Copy-Item "${source}\\*" -Destination "${destination}" -Recurse`); } else { await cpp.exec(`cp -r '${source}/.' '${destination}'`); } From 3f7c0939b12b9adc344f43f4d3386d4f18a0cf57 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Apr 2020 15:26:18 +0800 Subject: [PATCH 14/34] fix test image --- test/pipelines/pipelines-it-pai-windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index b84fb26a70..71809b1d64 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -66,6 +66,6 @@ jobs: echo $(pai_token) echo $($(nni_manager_ip) mount -o anon \\10.151.41.14\data\share\drbdha\data Z: - python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(existing_docker_img) --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) + python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From 7e39b24285435ff1adee2870170f1df542ac50b2 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Apr 2020 16:06:50 +0800 Subject: [PATCH 15/34] fix docker image --- test/pipelines/pipelines-it-pai-windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index 71809b1d64..4fc8c883c3 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -66,6 +66,6 @@ jobs: echo $(pai_token) echo $($(nni_manager_ip) mount -o anon \\10.151.41.14\data\share\drbdha\data Z: - python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) + python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(docker_image) --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From 5396657725b75fd3d2599ff92d18bd4988639ad7 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sat, 4 Apr 2020 15:16:35 +0800 Subject: [PATCH 16/34] set nfs path in variable --- test/pipelines/pipelines-it-pai-windows.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index 4fc8c883c3..b1ab2e1b69 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -63,9 +63,7 @@ jobs: cd test set PATH=$(ENV_PATH) python --version - echo $(pai_token) - echo $($(nni_manager_ip) - mount -o anon \\10.151.41.14\data\share\drbdha\data Z: + mount -o anon $(pai_nfs_uri) $(local_nfs_uri) python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(docker_image) --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase displayName: 'Examples and advanced features tests on pai' \ No newline at end of file From 42572bc1adcbe857faccbbe9158fc6cfce5efb66 Mon Sep 17 00:00:00 2001 From: liuzhe Date: Tue, 3 Aug 2021 11:37:19 +0800 Subject: [PATCH 17/34] fix openpai v1 --- nni/experiment/config/convert.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nni/experiment/config/convert.py b/nni/experiment/config/convert.py index b07bf3acb0..68a5dafe0d 100644 --- a/nni/experiment/config/convert.py +++ b/nni/experiment/config/convert.py @@ -16,6 +16,8 @@ def to_v2(v1) -> ExperimentConfig: v1 = copy.deepcopy(v1) platform = v1.pop('trainingServicePlatform') + if platform == 'pai': + platform = 'openpai' assert platform in ['local', 'remote', 'openpai', 'aml'] v2 = ExperimentConfig(platform) From 732c6ebe5d5c0c4fffebb14a327bee664b456f32 Mon Sep 17 00:00:00 2001 From: quzha Date: Tue, 3 Aug 2021 12:58:37 +0800 Subject: [PATCH 18/34] update agent pool --- pipelines/integration-test-openpai-linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index 5685d275ae..a574a6faea 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -13,7 +13,7 @@ schedules: jobs: - job: pai - pool: NNI CI PAI CLI + pool: NNI CI PAI LINUX CLI timeoutInMinutes: 120 steps: From 5d35809e7e0fab05329df2edb14f2a54e409c4dd Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 11:45:57 +0800 Subject: [PATCH 19/34] debug pipeline --- pipelines/integration-test-openpai-linux.yml | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index a574a6faea..4b41052075 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -53,23 +53,23 @@ jobs: condition: eq(variables['build_docker_image'], 'true') displayName: Build and upload docker image - - script: | - set -e - cd test - python3 nni_test/nnitest/generate_ts_config.py \ - --ts pai \ - --pai_reuse false \ - --pai_host https://ne.openpai.org \ - --pai_user $(pai_user) \ - --nni_docker_image nnidev/nni-nightly \ - --pai_storage_config_name confignfs-data \ - --pai_token $(pai_token) \ - --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ - --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ - --nni_manager_ip $(manager_ip) \ - --vc nni - python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai - displayName: Integration test + # - script: | + # set -e + # cd test + # python3 nni_test/nnitest/generate_ts_config.py \ + # --ts pai \ + # --pai_reuse false \ + # --pai_host https://ne.openpai.org \ + # --pai_user $(pai_user) \ + # --nni_docker_image nnidev/nni-nightly \ + # --pai_storage_config_name confignfs-data \ + # --pai_token $(pai_token) \ + # --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ + # --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ + # --nni_manager_ip $(manager_ip) \ + # --vc nni + # python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai + # displayName: Integration test - script: | set -e From 3ddccee623b66c06382d67d9a952734494e812ed Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 11:46:46 +0800 Subject: [PATCH 20/34] fix pipeline --- pipelines/integration-test-openpai-linux.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index 4b41052075..5b3ed91c37 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -43,15 +43,15 @@ jobs: nnictl algo register --meta meta_file.yml displayName: Install customized tuner - - script: | - set -e - docker login -u nnidev -p $(docker_hub_password) - echo '## Build docker image ##' - docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly . - echo '## Upload docker image ##' - docker push nnidev/nni-nightly - condition: eq(variables['build_docker_image'], 'true') - displayName: Build and upload docker image + # - script: | + # set -e + # docker login -u nnidev -p $(docker_hub_password) + # echo '## Build docker image ##' + # docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly . + # echo '## Upload docker image ##' + # docker push nnidev/nni-nightly + # condition: eq(variables['build_docker_image'], 'true') + # displayName: Build and upload docker image # - script: | # set -e From 7391b9ea2a3c0913564f46508bd4565e41f96817 Mon Sep 17 00:00:00 2001 From: liuzhe Date: Thu, 5 Aug 2021 13:25:02 +0800 Subject: [PATCH 21/34] fix openpai --- ts/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- .../reusable/environments/openPaiEnvironmentService.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ts/nni_manager/training_service/pai/paiTrainingService.ts b/ts/nni_manager/training_service/pai/paiTrainingService.ts index 89fbbf4f07..13f0b05b6f 100644 --- a/ts/nni_manager/training_service/pai/paiTrainingService.ts +++ b/ts/nni_manager/training_service/pai/paiTrainingService.ts @@ -78,7 +78,7 @@ class PAITrainingService implements TrainingService { private async copyTrialCode(): Promise { await validateCodeDir(this.config.trialCodeDirectory); - const nniManagerNFSExpCodeDir = path.join(this.config.trialCodeDirectory, this.experimentId, 'nni-code'); + const nniManagerNFSExpCodeDir = path.join(this.config.localStorageMountPoint, this.experimentId, 'nni-code'); await execMkdir(nniManagerNFSExpCodeDir); this.log.info(`Starting copy codeDir data from ${this.config.trialCodeDirectory} to ${nniManagerNFSExpCodeDir}`); await execCopydir(this.config.trialCodeDirectory, nniManagerNFSExpCodeDir); diff --git a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 222b0b9e53..977e4f46f9 100644 --- a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -40,7 +40,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http'; // FIXME: only support MountedStorageService - const storageService = new MountedStorageService(); + const storageService = component.get(MountedStorageService); const remoteRoot = storageService.joinPath(this.config.localStorageMountPoint, this.experimentId); storageService.initialize(this.config.localStorageMountPoint, remoteRoot); } From b52163e8495ead1af81ee9131624067199dd2d15 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 13:30:39 +0800 Subject: [PATCH 22/34] fix pai --- .../reusable/environments/openPaiEnvironmentService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 3191aeeed3..c562d9dfd9 100644 --- a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -289,7 +289,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { taskRetryCount: 0, dockerImage: 'docker_image_0', resourcePerInstance: { - gpu: this.config.trialGpuNumber, + gpu: this.config.trialGpuNumber === undefined? 0: this.config.trialGpuNumber, cpu: this.config.trialCpuNumber, memoryMB: toMegaBytes(this.config.trialMemorySize) }, From 5f43cf7e36292082eb6f09fd9f3dca37d5feed8f Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 14:28:59 +0800 Subject: [PATCH 23/34] fix pai --- nni/experiment/config/convert.py | 6 +++--- nni/experiment/config/openpai.py | 1 + ts/nni_manager/common/experimentConfig.ts | 1 + .../reusable/environments/openPaiEnvironmentService.ts | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/nni/experiment/config/convert.py b/nni/experiment/config/convert.py index b07bf3acb0..73ef91f1e7 100644 --- a/nni/experiment/config/convert.py +++ b/nni/experiment/config/convert.py @@ -16,7 +16,7 @@ def to_v2(v1) -> ExperimentConfig: v1 = copy.deepcopy(v1) platform = v1.pop('trainingServicePlatform') - assert platform in ['local', 'remote', 'openpai', 'aml'] + assert platform in ['local', 'remote', 'pai', 'aml'] v2 = ExperimentConfig(platform) _drop_field(v1, 'authorName') @@ -81,14 +81,14 @@ def to_v2(v1) -> ExperimentConfig: _move_field(v1_machine, v2_machine, 'passwd', 'password') assert not v1_machine, v1_machine - if platform == 'openpai': + if platform == 'pai': _move_field(v1_trial, ts, 'nniManagerNFSMountPath', 'local_storage_mount_point') _move_field(v1_trial, ts, 'containerNFSMountPath', 'container_storage_mount_point') _move_field(v1_trial, ts, 'cpuNum', 'trial_cpu_number') if 'memoryMB' in v1_trial: ts.trial_memory_size = str(v1_trial.pop('memoryMB')) + 'mb' _move_field(v1_trial, ts, 'image', 'docker_image') - _deprecate(v1_trial, v2, 'virtualCluster') + _move_field(v1_trial, ts, 'virtualCluster', 'virtual_cluster') _move_field(v1_trial, ts, 'paiStorageConfigName', 'storage_config_name') _move_field(v1_trial, ts, 'paiConfigPath', 'openpaiConfigFile') diff --git a/nni/experiment/config/openpai.py b/nni/experiment/config/openpai.py index 66eecadac7..d17f68d1da 100644 --- a/nni/experiment/config/openpai.py +++ b/nni/experiment/config/openpai.py @@ -21,6 +21,7 @@ class OpenpaiConfig(TrainingServiceConfig): trial_memory_size: str storage_config_name: str docker_image: str = 'msranni/nni:latest' + virtual_cluster: str local_storage_mount_point: PathLike container_storage_mount_point: str reuse_mode: bool = True diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 7713bb94d4..53575d90bf 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -58,6 +58,7 @@ export interface OpenpaiConfig extends TrainingServiceConfig { containerStorageMountPoint: string; reuseMode: boolean; openpaiConfig?: object; + virtualCluster?: string; } /* AML */ diff --git a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index c562d9dfd9..de913f0e65 100644 --- a/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -307,9 +307,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService { submitFrom: 'submit-job-v2' } } - if (this.config.deprecated && this.config.deprecated.virtualCluster) { + if (this.config.virtualCluster) { nniJobConfig.defaults = { - virtualCluster: this.config.deprecated.virtualCluster + virtualCluster: this.config.virtualCluster } } } From c213f1d91ee173a05fa98a136003ef7d17a062a0 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 14:40:34 +0800 Subject: [PATCH 24/34] fix pai --- nni/experiment/config/convert.py | 4 +++- nni/experiment/config/openpai.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nni/experiment/config/convert.py b/nni/experiment/config/convert.py index 73ef91f1e7..254e4a498d 100644 --- a/nni/experiment/config/convert.py +++ b/nni/experiment/config/convert.py @@ -17,6 +17,8 @@ def to_v2(v1) -> ExperimentConfig: v1 = copy.deepcopy(v1) platform = v1.pop('trainingServicePlatform') assert platform in ['local', 'remote', 'pai', 'aml'] + if platform == 'pai': + platform = 'openpai' v2 = ExperimentConfig(platform) _drop_field(v1, 'authorName') @@ -81,7 +83,7 @@ def to_v2(v1) -> ExperimentConfig: _move_field(v1_machine, v2_machine, 'passwd', 'password') assert not v1_machine, v1_machine - if platform == 'pai': + if platform == 'openpai': _move_field(v1_trial, ts, 'nniManagerNFSMountPath', 'local_storage_mount_point') _move_field(v1_trial, ts, 'containerNFSMountPath', 'container_storage_mount_point') _move_field(v1_trial, ts, 'cpuNum', 'trial_cpu_number') diff --git a/nni/experiment/config/openpai.py b/nni/experiment/config/openpai.py index d17f68d1da..e941530f84 100644 --- a/nni/experiment/config/openpai.py +++ b/nni/experiment/config/openpai.py @@ -21,7 +21,7 @@ class OpenpaiConfig(TrainingServiceConfig): trial_memory_size: str storage_config_name: str docker_image: str = 'msranni/nni:latest' - virtual_cluster: str + virtual_cluster: Optional[str] local_storage_mount_point: PathLike container_storage_mount_point: str reuse_mode: bool = True From 7332688078401e218fd2dacda65a3b88877dbcce Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 15:07:17 +0800 Subject: [PATCH 25/34] fix build --- ts/nni_manager/common/experimentConfig.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 53575d90bf..6a3467722e 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -199,7 +199,7 @@ export function toSeconds(time: string): number { throw new Error(`Bad time string "${time}"`); } -const sizeUnits = { tb: 1024 * 1024, gb: 1024 * 1024, mb: 1, kb: 1 / 1024 }; +const sizeUnits = { tb: 1024 * 1024, gb: 1024, mb: 1, kb: 1 / 1024 }; export function toMegaBytes(size: string): number { for (const [unit, factor] of Object.entries(sizeUnits)) { From 5c14bc4ae5d47aa136d106cb9163bef18a6a5cdb Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Thu, 5 Aug 2021 15:55:14 +0800 Subject: [PATCH 26/34] revert change --- pipelines/integration-test-openpai-linux.yml | 52 ++++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index 5b3ed91c37..a574a6faea 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -43,33 +43,33 @@ jobs: nnictl algo register --meta meta_file.yml displayName: Install customized tuner - # - script: | - # set -e - # docker login -u nnidev -p $(docker_hub_password) - # echo '## Build docker image ##' - # docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly . - # echo '## Upload docker image ##' - # docker push nnidev/nni-nightly - # condition: eq(variables['build_docker_image'], 'true') - # displayName: Build and upload docker image + - script: | + set -e + docker login -u nnidev -p $(docker_hub_password) + echo '## Build docker image ##' + docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly . + echo '## Upload docker image ##' + docker push nnidev/nni-nightly + condition: eq(variables['build_docker_image'], 'true') + displayName: Build and upload docker image - # - script: | - # set -e - # cd test - # python3 nni_test/nnitest/generate_ts_config.py \ - # --ts pai \ - # --pai_reuse false \ - # --pai_host https://ne.openpai.org \ - # --pai_user $(pai_user) \ - # --nni_docker_image nnidev/nni-nightly \ - # --pai_storage_config_name confignfs-data \ - # --pai_token $(pai_token) \ - # --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ - # --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ - # --nni_manager_ip $(manager_ip) \ - # --vc nni - # python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai - # displayName: Integration test + - script: | + set -e + cd test + python3 nni_test/nnitest/generate_ts_config.py \ + --ts pai \ + --pai_reuse false \ + --pai_host https://ne.openpai.org \ + --pai_user $(pai_user) \ + --nni_docker_image nnidev/nni-nightly \ + --pai_storage_config_name confignfs-data \ + --pai_token $(pai_token) \ + --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ + --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ + --nni_manager_ip $(manager_ip) \ + --vc nni + python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai + displayName: Integration test - script: | set -e From c904ac8d21528f75e2f7e3ac785ba3d4c48131b0 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Tue, 10 Aug 2021 00:27:47 +0800 Subject: [PATCH 27/34] fix pai --- ts/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- .../training_service/reusable/routerTrainingService.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ts/nni_manager/training_service/pai/paiTrainingService.ts b/ts/nni_manager/training_service/pai/paiTrainingService.ts index 89fbbf4f07..13f0b05b6f 100644 --- a/ts/nni_manager/training_service/pai/paiTrainingService.ts +++ b/ts/nni_manager/training_service/pai/paiTrainingService.ts @@ -78,7 +78,7 @@ class PAITrainingService implements TrainingService { private async copyTrialCode(): Promise { await validateCodeDir(this.config.trialCodeDirectory); - const nniManagerNFSExpCodeDir = path.join(this.config.trialCodeDirectory, this.experimentId, 'nni-code'); + const nniManagerNFSExpCodeDir = path.join(this.config.localStorageMountPoint, this.experimentId, 'nni-code'); await execMkdir(nniManagerNFSExpCodeDir); this.log.info(`Starting copy codeDir data from ${this.config.trialCodeDirectory} to ${nniManagerNFSExpCodeDir}`); await execCopydir(this.config.trialCodeDirectory, nniManagerNFSExpCodeDir); diff --git a/ts/nni_manager/training_service/reusable/routerTrainingService.ts b/ts/nni_manager/training_service/reusable/routerTrainingService.ts index 5e22c55495..ac681b943a 100644 --- a/ts/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/ts/nni_manager/training_service/reusable/routerTrainingService.ts @@ -26,11 +26,11 @@ class RouterTrainingService implements TrainingService { const instance = new RouterTrainingService(); instance.log = getLogger('RouterTrainingService'); const platform = Array.isArray(config.trainingService) ? 'hybrid' : config.trainingService.platform; - if (platform === 'remote' && !(config.trainingService).reuseMode) { + if (platform === 'remote' && (config.trainingService).reuseMode === false) { instance.internalTrainingService = new RemoteMachineTrainingService(config); - } else if (platform === 'openpai' && !(config.trainingService).reuseMode) { + } else if (platform === 'openpai' && (config.trainingService).reuseMode === false) { instance.internalTrainingService = new PAITrainingService(config); - } else if (platform === 'kubeflow' && !(config.trainingService).reuseMode) { + } else if (platform === 'kubeflow' && (config.trainingService).reuseMode === false) { instance.internalTrainingService = new KubeflowTrainingService(); } else { instance.internalTrainingService = await TrialDispatcher.construct(config); From ea7abd9025f8e6f151d424abd8662c91e7bfc6b2 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Tue, 10 Aug 2021 13:07:31 +0800 Subject: [PATCH 28/34] fix pipeline --- pipelines/integration-test-openpai-linux.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index a574a6faea..92bec3548c 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -64,8 +64,8 @@ jobs: --nni_docker_image nnidev/nni-nightly \ --pai_storage_config_name confignfs-data \ --pai_token $(pai_token) \ - --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ - --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ + --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ + --container_nfs_mount_path $(container_nfs_mount_path) \ --nni_manager_ip $(manager_ip) \ --vc nni python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai @@ -82,8 +82,8 @@ jobs: --nni_docker_image nnidev/nni-nightly \ --pai_storage_config_name confignfs-data \ --pai_token $(pai_token) \ - --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ - --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ + --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ + --container_nfs_mount_path $(container_nfs_mount_path) \ --nni_manager_ip $(manager_ip) \ --vc nni python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai From bcb2c10379b7a932b40fe895c4a0384b77197fda Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Tue, 10 Aug 2021 14:22:42 +0800 Subject: [PATCH 29/34] debug --- pipelines/integration-test-openpai-linux.yml | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index 92bec3548c..aab1887386 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -53,23 +53,23 @@ jobs: condition: eq(variables['build_docker_image'], 'true') displayName: Build and upload docker image - - script: | - set -e - cd test - python3 nni_test/nnitest/generate_ts_config.py \ - --ts pai \ - --pai_reuse false \ - --pai_host https://ne.openpai.org \ - --pai_user $(pai_user) \ - --nni_docker_image nnidev/nni-nightly \ - --pai_storage_config_name confignfs-data \ - --pai_token $(pai_token) \ - --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ - --container_nfs_mount_path $(container_nfs_mount_path) \ - --nni_manager_ip $(manager_ip) \ - --vc nni - python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai - displayName: Integration test + # - script: | + # set -e + # cd test + # python3 nni_test/nnitest/generate_ts_config.py \ + # --ts pai \ + # --pai_reuse false \ + # --pai_host https://ne.openpai.org \ + # --pai_user $(pai_user) \ + # --nni_docker_image nnidev/nni-nightly \ + # --pai_storage_config_name confignfs-data \ + # --pai_token $(pai_token) \ + # --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ + # --container_nfs_mount_path $(container_nfs_mount_path) \ + # --nni_manager_ip $(manager_ip) \ + # --vc nni + # python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai + # displayName: Integration test - script: | set -e From f4aaf0d24d2bc09520584d1e8ccc61800aa6f78f Mon Sep 17 00:00:00 2001 From: liuzhe Date: Wed, 11 Aug 2021 11:34:34 +0800 Subject: [PATCH 30/34] . --- pipelines/integration-test-openpai-linux.yml | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index aab1887386..92bec3548c 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -53,23 +53,23 @@ jobs: condition: eq(variables['build_docker_image'], 'true') displayName: Build and upload docker image - # - script: | - # set -e - # cd test - # python3 nni_test/nnitest/generate_ts_config.py \ - # --ts pai \ - # --pai_reuse false \ - # --pai_host https://ne.openpai.org \ - # --pai_user $(pai_user) \ - # --nni_docker_image nnidev/nni-nightly \ - # --pai_storage_config_name confignfs-data \ - # --pai_token $(pai_token) \ - # --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ - # --container_nfs_mount_path $(container_nfs_mount_path) \ - # --nni_manager_ip $(manager_ip) \ - # --vc nni - # python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai - # displayName: Integration test + - script: | + set -e + cd test + python3 nni_test/nnitest/generate_ts_config.py \ + --ts pai \ + --pai_reuse false \ + --pai_host https://ne.openpai.org \ + --pai_user $(pai_user) \ + --nni_docker_image nnidev/nni-nightly \ + --pai_storage_config_name confignfs-data \ + --pai_token $(pai_token) \ + --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ + --container_nfs_mount_path $(container_nfs_mount_path) \ + --nni_manager_ip $(manager_ip) \ + --vc nni + python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai + displayName: Integration test - script: | set -e From d77e3118918ad524e959025b5d77c5d6eef83d65 Mon Sep 17 00:00:00 2001 From: liuzhe Date: Wed, 11 Aug 2021 12:41:11 +0800 Subject: [PATCH 31/34] version check --- ts/nni_manager/common/experimentConfig.ts | 2 +- ts/nni_manager/training_service/pai/paiTrainingService.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 568eabf5e7..2b69da0dfe 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -228,4 +228,4 @@ export function flattenConfig(config: ExperimentConfig, platform: string): T Object.assign(flattened, config.trainingService); } return flattened; -} \ No newline at end of file +} diff --git a/ts/nni_manager/training_service/pai/paiTrainingService.ts b/ts/nni_manager/training_service/pai/paiTrainingService.ts index 13f0b05b6f..805ab16786 100644 --- a/ts/nni_manager/training_service/pai/paiTrainingService.ts +++ b/ts/nni_manager/training_service/pai/paiTrainingService.ts @@ -70,6 +70,7 @@ class PAITrainingService implements TrainingService { this.paiTokenUpdateInterval = 7200000; //2hours this.log.info('Construct paiBase training service.'); this.config = flattenConfig(config, 'openpai'); + this.versionCheck = !this.config.debug; this.paiJobRestServer = new PAIJobRestServer(this); this.paiToken = this.config.token; this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http'; From 4ef565826be2f6a245fab4c9a8681e7cdd8fc855 Mon Sep 17 00:00:00 2001 From: liuzhe Date: Wed, 11 Aug 2021 13:42:19 +0800 Subject: [PATCH 32/34] debug --- pipelines/integration-test-openpai-linux.yml | 3 ++- test/nni_test/nnitest/generate_ts_config.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index 92bec3548c..f9e4bf5434 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -67,7 +67,8 @@ jobs: --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ --container_nfs_mount_path $(container_nfs_mount_path) \ --nni_manager_ip $(manager_ip) \ - --vc nni + --vc nni \ + --debug true python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai displayName: Integration test diff --git a/test/nni_test/nnitest/generate_ts_config.py b/test/nni_test/nnitest/generate_ts_config.py index 07c52abf18..98cfaf0fc8 100644 --- a/test/nni_test/nnitest/generate_ts_config.py +++ b/test/nni_test/nnitest/generate_ts_config.py @@ -33,6 +33,8 @@ def update_training_service_config(args): config[args.ts]['trial']['paiStorageConfigName'] = args.pai_storage_config_name if args.vc is not None: config[args.ts]['trial']['virtualCluster'] = args.vc + if args.debug is not None: + config[args.ts]['debug'] = args.debug elif args.ts == 'kubeflow': if args.nfs_server is not None: config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server From a6c955992c92e2d9f68cd4a3b577634ac6f76d80 Mon Sep 17 00:00:00 2001 From: liuzhe Date: Wed, 11 Aug 2021 13:59:21 +0800 Subject: [PATCH 33/34] . --- test/nni_test/nnitest/generate_ts_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/nni_test/nnitest/generate_ts_config.py b/test/nni_test/nnitest/generate_ts_config.py index 98cfaf0fc8..307b74cf31 100644 --- a/test/nni_test/nnitest/generate_ts_config.py +++ b/test/nni_test/nnitest/generate_ts_config.py @@ -148,6 +148,7 @@ def update_training_service_config(args): parser.add_argument("--pai_storage_config_name", type=str) parser.add_argument("--nni_manager_nfs_mount_path", type=str) parser.add_argument("--container_nfs_mount_path", type=str) + parser.add_argument("--debug", type=str) # args for kubeflow and frameworkController parser.add_argument("--nfs_path", type=str) parser.add_argument("--keyvault_vaultname", type=str) From b708612d14e5cd769803bd647391567192f80476 Mon Sep 17 00:00:00 2001 From: liuzhe Date: Wed, 11 Aug 2021 14:05:53 +0800 Subject: [PATCH 34/34] . --- test/nni_test/nnitest/generate_ts_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nni_test/nnitest/generate_ts_config.py b/test/nni_test/nnitest/generate_ts_config.py index 307b74cf31..99af993e6d 100644 --- a/test/nni_test/nnitest/generate_ts_config.py +++ b/test/nni_test/nnitest/generate_ts_config.py @@ -34,7 +34,7 @@ def update_training_service_config(args): if args.vc is not None: config[args.ts]['trial']['virtualCluster'] = args.vc if args.debug is not None: - config[args.ts]['debug'] = args.debug + config[args.ts]['debug'] = args.debug.lower() == 'true' elif args.ts == 'kubeflow': if args.nfs_server is not None: config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server