Fix AdaptDL Integration (#3153)

microsoft · Dec 9, 2020 · dbb2434 · dbb2434
1 parent 765206c
commit dbb2434
Show file tree

Hide file tree

Showing 9 changed files with 39 additions and 29 deletions.
diff --git a/.gitignore b/.gitignore
@@ -81,6 +81,7 @@ typings/
 __pycache__
 build
 *.egg-info
+.eggs/
 setup.pye
 **/__init__.pye
 **/.ipynb_checkpoints

diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@
 
 **NNI (Neural Network Intelligence)** is a lightweight but powerful toolkit to help users **automate** <a href="docs/en_US/FeatureEngineering/Overview.md">Feature Engineering</a>, <a href="docs/en_US/NAS/Overview.md">Neural Architecture Search</a>, <a href="docs/en_US/Tuner/BuiltinTuner.md">Hyperparameter Tuning</a> and <a href="docs/en_US/Compression/Overview.md">Model Compression</a>.
 
-The tool manages automated machine learning (AutoML) experiments, **dispatches and runs** experiments' trial jobs generated by tuning algorithms to search the best neural architecture and/or hyper-parameters in **different training environments** like <a href="docs/en_US/TrainingService/LocalMode.md">Local Machine</a>, <a href="docs/en_US/TrainingService/RemoteMachineMode.md">Remote Servers</a>, <a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a>, <a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a>, <a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a>, <a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a>, <a href="docs/en_US/TrainingService/AMLMode.md">AML (Azure Machine Learning)</a> and other cloud options.
+The tool manages automated machine learning (AutoML) experiments, **dispatches and runs** experiments' trial jobs generated by tuning algorithms to search the best neural architecture and/or hyper-parameters in **different training environments** like <a href="docs/en_US/TrainingService/LocalMode.md">Local Machine</a>, <a href="docs/en_US/TrainingService/RemoteMachineMode.md">Remote Servers</a>, <a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a>, <a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a>, <a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a>, <a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a>, <a href="docs/en_US/TrainingService/AMLMode.md">AML (Azure Machine Learning)</a>, <a href="docs/en_US/TrainingService/AdaptDLMode.md">AdaptDL (aka. ADL)</a> and other cloud options.
 
 ## **Who should consider using NNI**
 
@@ -173,11 +173,13 @@ Within the following table, we summarized the current NNI capabilities, we are g
         <li><a href="docs/en_US/TrainingService/RemoteMachineMode.md">Remote Servers</a></li>
         <li><a href="docs/en_US/TrainingService/AMLMode.md">AML(Azure Machine Learning)</a></li>
         <li><b>Kubernetes based services</b></li>
-            <ul><li><a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a></li>
-            <li><a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a></li>
-            <li><a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a></li>
-            </ul>
-            <ul><li><a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a></li>
+        <ul>
+          <li><a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a></li>
+          <li><a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a></li>
+          <li><a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a></li>
+          <li><a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a></li>
+          <li><a href="docs/en_US/TrainingService/AdaptDLMode.md">AdaptDL (aka. ADL)</a></li>
+        </ul>
       </ul>
       </td>
     </tr>

diff --git a/docs/en_US/TrainingService/AdaptDLMode.md b/docs/en_US/TrainingService/AdaptDLMode.md
@@ -66,7 +66,7 @@ trial:
     path: /
     containerMountPath: /nfs
   checkpoint: # optional
-    storageClass: microk8s-hostpath
+    storageClass: dfs
     storageSize: 1Gi
 ```
 
@@ -79,18 +79,21 @@ IP address of the machine with NNI manager (NNICTL) that launches NNI experiment
 * **logCollection**: *Recommended* to set as `http`. It will collect the trial logs on cluster back to your machine via http.
 * **tuner**: It supports the Tuun tuner and all NNI built-in tuners (only except for the checkpoint feature of the NNI PBT tuners).
 * **trial**: It defines the specs of an `adl` trial.
-    * **adaptive**: (*Optional*) Boolean for AdaptDL trainer. While `true`, it the job is preemptible and adaptive.
-    * **image**: Docker image for the trial
-    * **imagePullSecret**: (*Optional*) If you are using a private registry,
-    you need to provide the secret to successfully pull the image.
-    * **codeDir**: the working directory of the container. `.` means the default working directory defined by the image.
-    * **command**: the bash command to start the trial
-    * **gpuNum**: the number of GPUs requested for this trial. It must be non-negative integer.
-    * **cpuNum**: (*Optional*) the number of CPUs requested for this trial.  It must be non-negative integer.
-    * **memorySize**: (*Optional*) the size of memory requested for this trial. It must follow the Kubernetes
-    [default format](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory).
-    * **nfs**: (*Optional*) mounting external storage. For more information about using NFS please check the below paragraph.
-    * **checkpoint** (*Optional*) [storage settings](https://kubernetes.io/docs/concepts/storage/storage-classes/) for AdaptDL internal checkpoints. You can keep it optional if you are not dev users.
+  * **adaptive**: (*Optional*) Boolean for AdaptDL trainer. When `true`, the job is preemptible and adaptive.
+  * **image**: Docker image for the trial
+  * **imagePullSecret**: (*Optional*) If you are using a private registry,
+  you need to provide the secret to successfully pull the image.
+  * **codeDir**: the working directory of the container. `.` means the default working directory defined by the image.
+  * **command**: the bash command to start the trial
+  * **gpuNum**: the number of GPUs requested for this trial. It must be a non-negative integer.
+  * **cpuNum**: (*Optional*) the number of CPUs requested for this trial. It must be a non-negative integer.
+  * **memorySize**: (*Optional*) the size of memory requested for this trial. It must follow the Kubernetes
+  [default format](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory).
+  * **nfs**: (*Optional*) mounting external storage. For more information about using NFS, see the "NFS Storage" section below.
+  * **checkpoint**: (*Optional*) storage settings for model checkpoints.
+    * **storageClass**: check [Kubernetes storage documentation](https://kubernetes.io/docs/concepts/storage/storage-classes/) for how to use the appropriate `storageClass`.
+    * **storageSize**: this value should be large enough to fit your model's checkpoints; otherwise a "disk quota exceeded" error may occur.
+
 
 ### NFS Storage
 

diff --git a/examples/trials/cifar10_pytorch/config_adl.yml b/examples/trials/cifar10_pytorch/config_adl.yml
@@ -17,10 +17,13 @@ tuner:
     #choice: maximize, minimize
     optimize_mode: maximize
 trial:
-  command: python3 main_adl.py
-  codeDir: .
+  command: python3 /cifar10/main_adl.py
+  codeDir: /cifar10
   gpuNum: 1
   image: {replace_with_the_image_that_has_adaptdl_installed}
+  # optional
+  imagePullSecrets:
+    - name: {secret}
   adaptive: true
   checkpoint:
     storageClass: dfs

diff --git a/examples/trials/cifar10_pytorch/main_adl.py b/examples/trials/cifar10_pytorch/main_adl.py
@@ -146,7 +146,7 @@ def valid(epoch):
         writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
 
         if adaptdl.env.replica_rank() == 0:
-            nni.report_intermediate_result(stats["accuracy"], accum=stats)
+            nni.report_intermediate_result(stats["accuracy"])
 
         print("Valid:", stats)
         return stats["accuracy"]

diff --git a/nni/tools/nnictl/launcher_utils.py b/nni/tools/nnictl/launcher_utils.py
@@ -63,14 +63,16 @@ def parse_path(experiment_config, config_path):
     if experiment_config['trial'].get('paiConfigPath'):
         expand_path(experiment_config['trial'], 'paiConfigPath')
 
-    #if users use relative path, convert it to absolute path
+    # If users use relative path, convert it to absolute path.
     root_path = os.path.dirname(config_path)
     if experiment_config.get('searchSpacePath'):
         parse_relative_path(root_path, experiment_config, 'searchSpacePath')
     if experiment_config.get('logDir'):
         parse_relative_path(root_path, experiment_config, 'logDir')
     if experiment_config.get('trial'):
-        parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
+        # In AdaptDL mode, 'codeDir' shouldn't be parsed because it points to the path in the container.
+        if experiment_config.get('trainingServicePlatform') != 'adl':
+            parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
         if experiment_config['trial'].get('authFile'):
             parse_relative_path(root_path, experiment_config['trial'], 'authFile')
         if experiment_config['trial'].get('ps'):

diff --git a/nni/tools/nnictl/tensorboard_utils.py b/nni/tools/nnictl/tensorboard_utils.py
@@ -134,7 +134,6 @@ def start_tensorboard(args):
     if experiment_dict[args.id]["status"] == "STOPPED":
         print_error("Experiment {} is stopped...".format(args.id))
         return
-    config_file_name = experiment_dict[experiment_id]['fileName']
     nni_config = Config(args.id)
     if nni_config.get_config('experimentConfig').get('trainingServicePlatform') == 'adl':
         adl_tensorboard_helper(args)

diff --git a/ts/nni_manager/training_service/kubernetes/adl/adlTrainingService.ts b/ts/nni_manager/training_service/kubernetes/adl/adlTrainingService.ts
@@ -214,10 +214,10 @@ class AdlTrainingService extends KubernetesTrainingService implements Kubernetes
             trialJobId, form, codeDir, outputDir)
         const cleanupScriptTemplate: string =
 `#!/bin/bash
-ps aux | grep "python3 -m nni_trial_tool.trial_keeper" | awk '{print $2}' | xargs kill -2
+ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | xargs kill -2
 while true;
 do
-    proc=\`ps aux | grep "python3 -m nni_trial_tool.trial_keeper" | awk '{print $2}' | grep "" -c\`
+    proc=\`ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | grep "" -c\`
     if (( $proc == 1  )); then
         exit 0
     else
@@ -281,7 +281,7 @@ export NNI_TRIAL_SEQ_ID={4}
 mkdir -p $NNI_OUTPUT_DIR
 {5}
 echo '{6}' > $NNI_CODE_DIR/{7}
-python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' \
+python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{8}' \
 --nnimanager_ip {9} --nnimanager_port {10} \
 --nni_manager_version '{11}' --log_collection '{12}'
 `;

diff --git a/ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts b/ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts
@@ -16,7 +16,7 @@ fi`;
 export const PAI_TRIAL_COMMAND_FORMAT: string =
 `export NNI_PLATFORM=paiYarn NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
 && cd $NNI_SYS_DIR && sh install_nni.sh \
-&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
+&& python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
 --pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
 --nni_manager_version '{13}' --log_collection '{14}'`;