Merge pull request #251 from microsoft/master

Merge master
SparkSnail · May 29, 2020 · dcd2ffd · dcd2ffd
2 parents f548d82 + 5a911b3
commit dcd2ffd
Show file tree

Hide file tree

Showing 36 changed files with 121 additions and 88 deletions.
diff --git a/Makefile b/Makefile
@@ -60,6 +60,7 @@ NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
 
 ## Version number
 NNI_VERSION_VALUE = $(shell git describe --tags)
+NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
 NNI_VERSION_TEMPLATE = 999.0.0-developing
 
 # Main targets

diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a
 * Researchers and data scientists who want to easily **implement and experiment new AutoML algorithms**, may it be: hyperparameter tuning algorithm, neural architect search algorithm or model compression algorithm.
 * ML Platform owners who want to **support AutoML in their platform**.
 
-### **[NNI v1.5 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
+### **[NNI v1.6 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
 
 ## **NNI capabilities in a glance**
 
@@ -239,7 +239,7 @@ The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is
 * Download the examples via clone the source code.
 
   ```bash
-  git clone -b v1.5 https://github.com/Microsoft/nni.git
+  git clone -b v1.6 https://github.com/Microsoft/nni.git
   ```
 
 * Run the MNIST example.
@@ -319,8 +319,7 @@ After getting familiar with contribution agreements, you are ready to create you
 With authors' permission, we listed a set of NNI usage examples and relevant articles.
 
 * ### **External Repositories** ###
-   * Run [ENAS](examples/tuners/enas_nni/README.md) with NNI
-   * Run [Neural Network Architecture Search](examples/trials/nas_cifar10/README.md) with NNI
+   * Run [ENAS](examples/nas/enas/README.md) with NNI
    * [Automatic Feature Engineering](examples/feature_engineering/auto-feature-engineering/README.md) with NNI
    * [Hyperparameter Tuning for Matrix Factorization](https://github.com/microsoft/recommenders/blob/master/notebooks/04_model_select_and_optimize/nni_surprise_svd.ipynb) with NNI
    * [scikit-nni](https://github.com/ksachdeva/scikit-nni) Hyper-parameter search for scikit-learn pipelines using NNI
@@ -342,7 +341,7 @@ With authors' permission, we listed a set of NNI usage examples and relevant art
 Join IM discussion groups:
 |Gitter||WeChat|
 |----|----|----|
-|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://github.com/JSong-Jia/NNI-user-group/blob/master/user%20group%20code_0512.jpg)|
+|<img src="https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png" width="180"/>| OR |<img src="https://user-images.githubusercontent.com/39592018/83108240-113d9600-a0f2-11ea-91f8-8754af11a0ee.png" width="180"/>|
 
 
 ## Related Projects

diff --git a/deployment/pypi/Makefile b/deployment/pypi/Makefile
@@ -13,6 +13,7 @@ endif
 
 TIME_STAMP = $(shell date -u "+%y%m%d%H%M")
 NNI_VERSION_VALUE = $(shell git describe --tags --abbrev=0)
+NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
 
 # To include time stamp in version value, run:
 # make version_ts=true build
@@ -25,6 +26,7 @@ NNI_YARN_FOLDER ?= $(CWD)nni-yarn
 NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
 .PHONY: build
 build:
+	# Building version $(NNI_VERSION_VALUE)
 	python3 -m pip install --user --upgrade setuptools wheel
 	wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz
 	rm -rf $(CWD)node-$(OS_SPEC)-x64

diff --git a/deployment/pypi/install.ps1 b/deployment/pypi/install.ps1
@@ -15,6 +15,7 @@ else{
 
 $TIME_STAMP = date -u "+%y%m%d%H%M"
 $NNI_VERSION_VALUE = git describe --tags --abbrev=0
+$NNI_VERSION_VALUE = $NNI_VERSION_VALUE.substring(1)
 
 # To include time stamp in version value, run:
 # make version_ts=true build

diff --git a/docs/en_US/Compressor/ModelSpeedup.md b/docs/en_US/Compressor/ModelSpeedup.md
@@ -21,7 +21,7 @@ For each module, we should prepare four functions, three for shape inference and
 ## Usage
 
 ```python
-from nni.compression.speedup.torch import ModelSpeedup
+from nni.compression.torch import ModelSpeedup
 # model: the model you want to speed up
 # dummy_input: dummy input of the model, given to `jit.trace`
 # masks_file: the mask file created by pruning algorithms

diff --git a/docs/en_US/Release.md b/docs/en_US/Release.md
@@ -1,5 +1,46 @@
 # ChangeLog
 
+## Release 1.6 - 5/26/2020
+
+### Major Features
+
+#### New Features and improvement
+* Improve IPC limitation to 100W
+* improve code storage upload logic among trials in non-local platform
+* support `__version__` for SDK version
+* Support Windows dev install
+
+#### Web UI
+* Show trial error message
+* finalize homepage layout
+* Refactor overview's best trials module
+* Remove multiphase from webui
+* add tooltip for trial concurrency in the overview page
+* Show top trials for hyper-parameter graph
+
+#### HPO Updates
+* Improve PBT on failure handling and support experiment resume for PBT
+
+#### NAS Updates
+* NAS support for TensorFlow 2.0 (preview) [TF2.0 NAS examples](https://github.com/microsoft/nni/tree/master/examples/nas/naive-tf)
+* Use OrderedDict for LayerChoice
+* Prettify the format of export
+* Replace layer choice with selected module after applied fixed architecture
+
+#### Model Compression Updates
+* Model compression PyTorch 1.4 support
+
+#### Training Service Updates
+* update pai yaml merge logic
+* support windows as remote machine in remote mode [Remote Mode](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/RemoteMachineMode.md#windows)
+
+### Bug Fix
+* fix dev install
+* Fix SPOS example crash when the checkpoints do not have state_dict
+* Fix table sort issue when experiment had failed trial
+* Support multiple Python environments (conda, pyenv, etc.)
+
+
 ## Release 1.5 - 4/13/2020
 
 ### New Features and Documentation

diff --git a/examples/model_compress/model_speedup.py b/examples/model_compress/model_speedup.py
@@ -6,8 +6,7 @@
 import torch.nn.functional as F
 from torchvision import datasets, transforms
 from models.cifar10.vgg import VGG
-from nni.compression.speedup.torch import ModelSpeedup
-from nni.compression.torch import apply_compression_results
+from nni.compression.torch import apply_compression_results, ModelSpeedup
 
 torch.manual_seed(0)
 use_mask = True

diff --git a/examples/nas/enas-tf/datasets.py b/examples/nas/enas-tf/datasets.py
@@ -2,7 +2,6 @@
 # Licensed under the MIT license.
 
 import tensorflow as tf
-from tensorflow.data import Dataset
 
 def get_dataset():
     (x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data()

diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts
@@ -566,7 +566,7 @@ class NNIManager implements Manager {
             assert(this.status.status === 'RUNNING' ||
                 this.status.status === 'DONE' ||
                 this.status.status === 'NO_MORE_TRIAL' ||
-                this.status.status === 'TUNER_NO_MORE_TRIAL');
+                this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
             if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
                 this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
                 if (this.status.status !== 'DONE') {

diff --git a/src/nni_manager/training_service/kubernetes/kubernetesData.ts b/src/nni_manager/training_service/kubernetes/kubernetesData.ts
@@ -47,10 +47,10 @@ export NNI_EXP_ID={4}
 export NNI_CODE_DIR={5}
 export NNI_TRIAL_SEQ_ID={6}
 {7}
-mkdir -p $NNI_SYS_DIR
+mkdir -p $NNI_SYS_DIR/code
 mkdir -p $NNI_OUTPUT_DIR
-cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
-cd $NNI_SYS_DIR
-sh install_nni.sh
+cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
+sh $NNI_SYS_DIR/install_nni.sh
+cd $NNI_SYS_DIR/code
 python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
 --nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;
diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts
@@ -477,16 +477,14 @@ class LocalTrainingService implements TrainingService {
     private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
         const script: string[] = [];
         if (process.platform === 'win32') {
-            script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`);
-            script.push(`cd $env:NNI_SYS_DIR`);
+            script.push(`cd $env:NNI_CODE_DIR`);
             script.push(
                 `cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`,
                 `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
                 `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
                 `Write $LASTEXITCODE " " $NOW_DATE  | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`);
         } else {
-            script.push(`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`);
-            script.push(`cd $NNI_SYS_DIR`);
+            script.push(`cd $NNI_CODE_DIR`);
             script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`);
             if (process.platform === 'darwin') {
                 // https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x

diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
@@ -31,6 +31,6 @@ fi`;
 
 export const PAI_K8S_TRIAL_COMMAND_FORMAT: string =
 `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
-&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \
-&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
+&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
+&& cd $NNI_SYS_DIR/code && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
 --nni_manager_version '{10}' --log_collection '{11}'`;
diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
@@ -54,7 +54,7 @@ const yaml = require('js-yaml');
 class PAIK8STrainingService extends PAITrainingService {
     protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
     private copyExpCodeDirPromise?: Promise<void>;
-    private paiJobConfig: undefined;
+    private paiJobConfig: any;
     private nniVersion: string | undefined;
     constructor() {
         super();
@@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService {
 
         let nniJobConfig: any = undefined;
         if (this.paiTrialConfig.paiConfigPath) {
-            nniJobConfig = this.paiJobConfig;
+            nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript
             nniJobConfig.name = jobName;
             // Each taskRole will generate new command in NNI's command format
             // Each command will be formatted to NNI style
@@ -290,8 +290,6 @@ class PAIK8STrainingService extends PAITrainingService {
             await this.writeParameterFile(trialJobDetail.logPath, trialJobDetail.form.hyperParameters);
         }
 
-        //Copy codeDir files to local working folder
-        await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
         //Generate Job Configuration in yaml format
         const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
         this.log.debug(paiJobConfig);

diff --git a/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts b/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
@@ -22,10 +22,10 @@ class LinuxCommands extends OsCommands {
             export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \
             NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir}
             export MULTI_PHASE=${isMultiPhase}
-
-            cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
-            cd $NNI_SYS_DIR
-            sh install_nni.sh
+            mkdir -p $NNI_SYS_DIR/code
+            cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
+            sh $NNI_SYS_DIR/install_nni.sh
+            cd $NNI_SYS_DIR/code
             python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \
                 --nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \
                 --job_id_file ${jobIdFileName} \
@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
         return result;
     }
 
-    public killChildProcesses(pidFileName: string): string {
+    public killChildProcesses(pidFileName: string, killSelf: boolean): string {
         // prevent trialkeeper to be killed, so it can save exit code.
-        const command = `list_descendants ()
+        let command = `list_descendants ()
                 {
                 local children=$(ps -o pid= --ppid "$1")
 
@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
                 echo "$children"
                 }
             kill $(list_descendants \`cat '${pidFileName}'\`)`
+        if (killSelf) {
+            command += `\nkill \`cat '${pidFileName}'\``
+        }
         return command;
     }
 

diff --git a/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts b/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
@@ -28,9 +28,9 @@ class WindowsCommands extends OsCommands {
             set MULTI_PHASE=${isMultiPhase}
             set NNI_CODE_DIR=${codeDir}
             ${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""}
-
-            robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%
-            cd %NNI_SYS_DIR%
+            md %NNI_SYS_DIR%/code
+            robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%/code
+            cd %NNI_SYS_DIR%/code
             python -c "import nni" 2>nul
             if not %ERRORLEVEL% EQU 0 (
                 echo installing NNI as exit code of "import nni" is %ERRORLEVEL%
@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
         return result;
     }
 
-    public killChildProcesses(pidFileName: string): string {
-        const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
+    public killChildProcesses(pidFileName: string, killSelf: boolean): string {
+        let command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
             `Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
-            `if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` +
+            `if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force"}}` +
             `kill-tree $ppid"`;
+        if (killSelf){
+            command += `;Stop-Process -Id $ppid`;
+        }
         return command;
     }
 

diff --git a/src/nni_manager/training_service/remote_machine/osCommands.ts b/src/nni_manager/training_service/remote_machine/osCommands.ts
@@ -25,7 +25,7 @@ abstract class OsCommands {
     public abstract readLastLines(fileName: string, lineCount: number): string;
     public abstract isProcessAliveCommand(pidFileName: string): string;
     public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
-    public abstract killChildProcesses(pidFileName: string): string;
+    public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
     public abstract extractFile(tarFileName: string, targetFolder: string): string;
     public abstract executeScript(script: string, isFile: boolean): string;
 

diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
                 }
             }
             if (restServer.getErrorMessage !== undefined) {
-                throw new Error(restServer.getErrorMessage);
                 this.stopping = true;
+                throw new Error(restServer.getErrorMessage);
             }
             await delay(3000);
         }
@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
                 if (executor !== undefined) {
                     this.log.info(`killing gpu metric collector on ${executor.name}`);
                     const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
-                    await executor.killChildProcesses(gpuJobPidPath);
+                    await executor.killChildProcesses(gpuJobPidPath, true);
                 }
                 executorManager.releaseAllExecutor();
             }
@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
                             this.timer.unsubscribe(disposable);
                         }
                     }
+                    if (this.stopping){
+                        this.timer.unsubscribe(disposable);
+                        this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
+                    }
                     collectingCount.pop();
                 }
             }

diff --git a/src/nni_manager/training_service/remote_machine/shellExecutor.ts b/src/nni_manager/training_service/remote_machine/shellExecutor.ts
@@ -230,8 +230,8 @@ class ShellExecutor {
         return result !== undefined ? result : false;
     }
 
-    public async killChildProcesses(pidFileName: string): Promise<boolean> {
-        const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName);
+    public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
+        const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
         const commandResult = await this.execute(commandText);
         return commandResult.exitCode == 0;
     }

diff --git a/src/sdk/pynni/nni/compression/torch/__init__.py b/src/sdk/pynni/nni/compression/torch/__init__.py
@@ -1,10 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+from .pruning import *
+from .quantization import *
 from .compressor import Compressor, Pruner, Quantizer
-from .pruners import *
-from .weight_rank_filter_pruners import *
-from .activation_rank_filter_pruners import *
-from .quantizers import *
-from .apply_compression import apply_compression_results
-from .gradient_rank_filter_pruners import *
+from .speedup import ModelSpeedup
diff --git a/src/sdk/pynni/nni/compression/torch/pruning/__init__.py b/src/sdk/pynni/nni/compression/torch/pruning/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from .pruners import *
+from .weight_rank_filter_pruners import *
+from .activation_rank_filter_pruners import *
+from .apply_compression import apply_compression_results
+from .gradient_rank_filter_pruners import *
diff --git a/...n/torch/activation_rank_filter_pruners.py → ...pruning/activation_rank_filter_pruners.py b/...n/torch/activation_rank_filter_pruners.py → ...pruning/activation_rank_filter_pruners.py
@@ -4,8 +4,8 @@
 import logging
 import torch
 from schema import And, Optional
-from .utils import CompressorSchema
-from .compressor import Pruner
+from ..utils.config_validation import CompressorSchema
+from ..compressor import Pruner
 
 __all__ = ['ActivationAPoZRankFilterPruner', 'ActivationMeanRankFilterPruner']
 

diff --git a/...ni/compression/torch/apply_compression.py → ...ession/torch/pruning/apply_compression.py b/...ni/compression/torch/apply_compression.py → ...ession/torch/pruning/apply_compression.py
diff --git a/...ion/torch/gradient_rank_filter_pruners.py → ...h/pruning/gradient_rank_filter_pruners.py b/...ion/torch/gradient_rank_filter_pruners.py → ...h/pruning/gradient_rank_filter_pruners.py
@@ -3,7 +3,7 @@
 
 import logging
 import torch
-from .compressor import Pruner
+from ..compressor import Pruner
 
 __all__ = ['TaylorFOWeightFilterPruner']
 

diff --git a/...dk/pynni/nni/compression/torch/pruners.py → .../nni/compression/torch/pruning/pruners.py b/...dk/pynni/nni/compression/torch/pruners.py → .../nni/compression/torch/pruning/pruners.py
@@ -5,8 +5,8 @@
 import logging
 import torch
 from schema import And, Optional
-from .compressor import Pruner
-from .utils import CompressorSchema
+from ..utils.config_validation import CompressorSchema
+from ..compressor import Pruner
 
 __all__ = ['LevelPruner', 'AGP_Pruner', 'SlimPruner', 'LotteryTicketPruner']