diff --git a/pipelines/integration-test-openpai-linux.yml b/pipelines/integration-test-openpai-linux.yml index a574a6faea..f9e4bf5434 100644 --- a/pipelines/integration-test-openpai-linux.yml +++ b/pipelines/integration-test-openpai-linux.yml @@ -64,10 +64,11 @@ jobs: --nni_docker_image nnidev/nni-nightly \ --pai_storage_config_name confignfs-data \ --pai_token $(pai_token) \ - --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ - --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ + --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ + --container_nfs_mount_path $(container_nfs_mount_path) \ --nni_manager_ip $(manager_ip) \ - --vc nni + --vc nni \ + --debug true python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai displayName: Integration test @@ -82,8 +83,8 @@ jobs: --nni_docker_image nnidev/nni-nightly \ --pai_storage_config_name confignfs-data \ --pai_token $(pai_token) \ - --nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \ - --container_nfs_mount_path /mnt/confignfs-data/shinyang3 \ + --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \ + --container_nfs_mount_path $(container_nfs_mount_path) \ --nni_manager_ip $(manager_ip) \ --vc nni python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai diff --git a/test/nni_test/nnitest/generate_ts_config.py b/test/nni_test/nnitest/generate_ts_config.py index 07c52abf18..99af993e6d 100644 --- a/test/nni_test/nnitest/generate_ts_config.py +++ b/test/nni_test/nnitest/generate_ts_config.py @@ -33,6 +33,8 @@ def update_training_service_config(args): config[args.ts]['trial']['paiStorageConfigName'] = args.pai_storage_config_name if args.vc is not None: config[args.ts]['trial']['virtualCluster'] = args.vc + if args.debug is not None: + config[args.ts]['debug'] = args.debug.lower() == 'true' elif args.ts == 'kubeflow': if args.nfs_server is not None: config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server @@ -146,6 +148,7 @@ def update_training_service_config(args): parser.add_argument("--pai_storage_config_name", type=str) parser.add_argument("--nni_manager_nfs_mount_path", type=str) parser.add_argument("--container_nfs_mount_path", type=str) + parser.add_argument("--debug", type=str) # args for kubeflow and frameworkController parser.add_argument("--nfs_path", type=str) parser.add_argument("--keyvault_vaultname", type=str) diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 568eabf5e7..2b69da0dfe 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -228,4 +228,4 @@ export function flattenConfig(config: ExperimentConfig, platform: string): T Object.assign(flattened, config.trainingService); } return flattened; -} \ No newline at end of file +} diff --git a/ts/nni_manager/training_service/pai/paiTrainingService.ts b/ts/nni_manager/training_service/pai/paiTrainingService.ts index 89fbbf4f07..805ab16786 100644 --- a/ts/nni_manager/training_service/pai/paiTrainingService.ts +++ b/ts/nni_manager/training_service/pai/paiTrainingService.ts @@ -70,6 +70,7 @@ class PAITrainingService implements TrainingService { this.paiTokenUpdateInterval = 7200000; //2hours this.log.info('Construct paiBase training service.'); this.config = flattenConfig(config, 'openpai'); + this.versionCheck = !this.config.debug; this.paiJobRestServer = new PAIJobRestServer(this); this.paiToken = this.config.token; this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http'; @@ -78,7 +79,7 @@ class PAITrainingService implements TrainingService { private async copyTrialCode(): Promise { await validateCodeDir(this.config.trialCodeDirectory); - const nniManagerNFSExpCodeDir = path.join(this.config.trialCodeDirectory, this.experimentId, 'nni-code'); + const nniManagerNFSExpCodeDir = path.join(this.config.localStorageMountPoint, this.experimentId, 'nni-code'); await execMkdir(nniManagerNFSExpCodeDir); this.log.info(`Starting copy codeDir data from ${this.config.trialCodeDirectory} to ${nniManagerNFSExpCodeDir}`); await execCopydir(this.config.trialCodeDirectory, nniManagerNFSExpCodeDir); diff --git a/ts/nni_manager/training_service/reusable/routerTrainingService.ts b/ts/nni_manager/training_service/reusable/routerTrainingService.ts index 5e22c55495..ac681b943a 100644 --- a/ts/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/ts/nni_manager/training_service/reusable/routerTrainingService.ts @@ -26,11 +26,11 @@ class RouterTrainingService implements TrainingService { const instance = new RouterTrainingService(); instance.log = getLogger('RouterTrainingService'); const platform = Array.isArray(config.trainingService) ? 'hybrid' : config.trainingService.platform; - if (platform === 'remote' && !(config.trainingService).reuseMode) { + if (platform === 'remote' && (config.trainingService).reuseMode === false) { instance.internalTrainingService = new RemoteMachineTrainingService(config); - } else if (platform === 'openpai' && !(config.trainingService).reuseMode) { + } else if (platform === 'openpai' && (config.trainingService).reuseMode === false) { instance.internalTrainingService = new PAITrainingService(config); - } else if (platform === 'kubeflow' && !(config.trainingService).reuseMode) { + } else if (platform === 'kubeflow' && (config.trainingService).reuseMode === false) { instance.internalTrainingService = new KubeflowTrainingService(); } else { instance.internalTrainingService = await TrialDispatcher.construct(config);