Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

merge V1.6 back #2492

Merged
merged 55 commits into from
May 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
704b50e
Merge pull request #200 from microsoft/master
SparkSnail Aug 6, 2019
5b0034e
Merge pull request #204 from microsoft/master
SparkSnail Aug 20, 2019
8fe2588
Merge pull request #205 from microsoft/master
SparkSnail Aug 30, 2019
9fae194
Merge pull request #206 from microsoft/master
SparkSnail Sep 16, 2019
c785655
Merge pull request #207 from microsoft/master
SparkSnail Oct 21, 2019
2f5272c
Merge pull request #208 from microsoft/master
SparkSnail Oct 24, 2019
1892bc2
Merge pull request #209 from microsoft/master
SparkSnail Oct 28, 2019
7c1ab11
Merge pull request #210 from microsoft/master
SparkSnail Oct 28, 2019
8c203f3
Merge pull request #211 from microsoft/master
SparkSnail Oct 31, 2019
d7a62f6
check pylint for nni_cmd
SparkSnail Oct 31, 2019
e259d10
fix id error
SparkSnail Oct 31, 2019
4997295
Merge pull request #212 from microsoft/master
SparkSnail Nov 3, 2019
c037a7c
Merge pull request #213 from microsoft/master
SparkSnail Nov 10, 2019
7620e7c
Merge pull request #214 from microsoft/master
SparkSnail Nov 14, 2019
d16dbe9
Merge pull request #215 from microsoft/master
SparkSnail Nov 19, 2019
9ce751d
Merge pull request #216 from microsoft/master
SparkSnail Nov 21, 2019
a0846f2
Merge pull request #217 from microsoft/master
SparkSnail Nov 22, 2019
cd3a912
Merge pull request #218 from microsoft/master
SparkSnail Nov 27, 2019
32efaa3
Merge pull request #219 from microsoft/master
SparkSnail Dec 10, 2019
543239c
Merge pull request #220 from microsoft/master
SparkSnail Dec 12, 2019
36e6e35
Merge pull request #221 from microsoft/master
SparkSnail Dec 19, 2019
f9ee589
Merge pull request #222 from microsoft/master
SparkSnail Dec 24, 2019
b9a7a95
Merge pull request #223 from microsoft/master
SparkSnail Dec 25, 2019
1a5c017
Merge pull request #224 from microsoft/master
SparkSnail Jan 6, 2020
392460a
Merge pull request #225 from microsoft/master
SparkSnail Jan 8, 2020
9bafa4c
Merge pull request #226 from microsoft/master
SparkSnail Jan 8, 2020
c23b807
Merge pull request #227 from microsoft/master
SparkSnail Jan 10, 2020
4132f62
Merge pull request #228 from microsoft/master
SparkSnail Jan 10, 2020
4f66d0c
Merge pull request #229 from microsoft/master
SparkSnail Feb 1, 2020
129c4a5
Merge pull request #230 from microsoft/master
SparkSnail Feb 4, 2020
3fe117f
Merge pull request #231 from microsoft/master
SparkSnail Feb 7, 2020
aa31674
Merge pull request #233 from microsoft/master
SparkSnail Feb 21, 2020
1d74ae5
Merge pull request #234 from microsoft/master
SparkSnail Feb 27, 2020
75028bd
Merge pull request #235 from microsoft/master
SparkSnail Mar 17, 2020
4773c91
Merge pull request #236 from microsoft/master
SparkSnail Mar 18, 2020
3ee0961
Merge pull request #237 from microsoft/master
SparkSnail Mar 20, 2020
0fb7862
Merge pull request #238 from microsoft/master
SparkSnail Mar 26, 2020
6c3148c
Merge pull request #239 from microsoft/master
SparkSnail Apr 3, 2020
b4773e1
Merge pull request #240 from microsoft/master
SparkSnail Apr 11, 2020
6728799
Merge pull request #241 from microsoft/master
SparkSnail Apr 16, 2020
1b9daa3
Merge pull request #242 from microsoft/master
SparkSnail Apr 20, 2020
e0c2c0e
Merge pull request #243 from microsoft/master
SparkSnail Apr 23, 2020
e29b58a
Merge pull request #244 from microsoft/master
SparkSnail Apr 30, 2020
1e51182
Merge pull request #245 from microsoft/master
SparkSnail May 8, 2020
d90433d
Merge pull request #246 from microsoft/master
SparkSnail May 12, 2020
6568eae
Merge pull request #247 from microsoft/master
SparkSnail May 18, 2020
0fd38de
Merge pull request #248 from microsoft/master
SparkSnail May 19, 2020
0a742af
Merge pull request #249 from microsoft/master
SparkSnail May 19, 2020
f548d82
Merge pull request #250 from microsoft/master
SparkSnail May 19, 2020
241b364
fix pai yaml merge (#2477)
SparkSnail May 25, 2020
3aae9d0
Fix tensorflow import (#2481)
liuzhe-lz May 25, 2020
8d8fcd2
Remove leading letter "v" from version string (#2480)
liuzhe-lz May 25, 2020
e640ad6
Fix storage logic (#2488)
SparkSnail May 25, 2020
be09f11
Improve stablability of remote training service. (#2474)
squirrelsc May 25, 2020
7d5feeb
v1.6 release note (#2491)
SparkSnail May 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn

## Version number
NNI_VERSION_VALUE = $(shell git describe --tags)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
NNI_VERSION_TEMPLATE = 999.0.0-developing

# Main targets
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a
* Researchers and data scientists who want to easily **implement and experiment with new AutoML algorithms**, be it a hyperparameter tuning algorithm, a neural architecture search algorithm, or a model compression algorithm.
* ML Platform owners who want to **support AutoML in their platform**.

### **[NNI v1.5 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
### **[NNI v1.6 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**

## **NNI capabilities in a glance**

Expand Down Expand Up @@ -239,7 +239,7 @@ The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is
* Download the examples by cloning the source code.

```bash
git clone -b v1.5 https://github.com/Microsoft/nni.git
git clone -b v1.6 https://github.com/Microsoft/nni.git
```

* Run the MNIST example.
Expand Down
2 changes: 2 additions & 0 deletions deployment/pypi/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ endif

TIME_STAMP = $(shell date -u "+%y%m%d%H%M")
NNI_VERSION_VALUE = $(shell git describe --tags --abbrev=0)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)

# To include time stamp in version value, run:
# make version_ts=true build
Expand All @@ -25,6 +26,7 @@ NNI_YARN_FOLDER ?= $(CWD)nni-yarn
NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
.PHONY: build
build:
# Building version $(NNI_VERSION_VALUE)
python3 -m pip install --user --upgrade setuptools wheel
wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz
rm -rf $(CWD)node-$(OS_SPEC)-x64
Expand Down
1 change: 1 addition & 0 deletions deployment/pypi/install.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ else{

$TIME_STAMP = date -u "+%y%m%d%H%M"
$NNI_VERSION_VALUE = git describe --tags --abbrev=0
$NNI_VERSION_VALUE = $NNI_VERSION_VALUE.substring(1)

# To include time stamp in version value, run:
# make version_ts=true build
Expand Down
41 changes: 41 additions & 0 deletions docs/en_US/Release.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,46 @@
# ChangeLog

## Release 1.6 - 5/26/2020

### Major Features

#### New Features and improvement
* Improve IPC limitation to 100W
* Improve code storage upload logic among trials on non-local platforms
* Support `__version__` for SDK version
* Support Windows dev install

#### Web UI
* Show trial error message
* finalize homepage layout
* Refactor overview's best trials module
* Remove multiphase from webui
* add tooltip for trial concurrency in the overview page
* Show top trials for hyper-parameter graph

#### HPO Updates
* Improve PBT on failure handling and support experiment resume for PBT

#### NAS Updates
* NAS support for TensorFlow 2.0 (preview) [TF2.0 NAS examples](https://github.com/microsoft/nni/tree/master/examples/nas/naive-tf)
* Use OrderedDict for LayerChoice
* Prettify the format of export
* Replace layer choice with selected module after applied fixed architecture

#### Model Compression Updates
* Model compression PyTorch 1.4 support

#### Training Service Updates
* Update PAI YAML merge logic
* Support Windows as a remote machine in remote mode [Remote Mode](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/RemoteMachineMode.md#windows)

### Bug Fix
* Fix dev install
* Fix SPOS example crash when the checkpoints do not have state_dict
* Fix table sort issue when experiment had failed trial
* Support multiple Python environments (conda, pyenv, etc.)


## Release 1.5 - 4/13/2020

### New Features and Documentation
Expand Down
1 change: 0 additions & 1 deletion examples/nas/enas-tf/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# Licensed under the MIT license.

import tensorflow as tf
from tensorflow.data import Dataset

def get_dataset():
(x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data()
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL');
this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') {
Expand Down
8 changes: 4 additions & 4 deletions src/nni_manager/training_service/kubernetes/kubernetesData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ export NNI_EXP_ID={4}
export NNI_CODE_DIR={5}
export NNI_TRIAL_SEQ_ID={6}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_SYS_DIR/code
mkdir -p $NNI_OUTPUT_DIR
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
sh $NNI_SYS_DIR/install_nni.sh
cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
--nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;
Original file line number Diff line number Diff line change
Expand Up @@ -477,16 +477,14 @@ class LocalTrainingService implements TrainingService {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = [];
if (process.platform === 'win32') {
script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`);
script.push(`cd $env:NNI_SYS_DIR`);
script.push(`cd $env:NNI_CODE_DIR`);
script.push(
`cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`);
} else {
script.push(`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`);
script.push(`cd $NNI_SYS_DIR`);
script.push(`cd $NNI_CODE_DIR`);
script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`);
if (process.platform === 'darwin') {
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ fi`;

export const PAI_K8S_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}'`;
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ const yaml = require('js-yaml');
class PAIK8STrainingService extends PAITrainingService {
protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
private copyExpCodeDirPromise?: Promise<void>;
private paiJobConfig: undefined;
private paiJobConfig: any;
private nniVersion: string | undefined;
constructor() {
super();
Expand Down Expand Up @@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService {

let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) {
nniJobConfig = this.paiJobConfig;
nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript
nniJobConfig.name = jobName;
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
Expand Down Expand Up @@ -290,8 +290,6 @@ class PAIK8STrainingService extends PAITrainingService {
await this.writeParameterFile(trialJobDetail.logPath, trialJobDetail.form.hyperParameters);
}

//Copy codeDir files to local working folder
await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
//Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ class LinuxCommands extends OsCommands {
export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \
NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir}
export MULTI_PHASE=${isMultiPhase}

cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
mkdir -p $NNI_SYS_DIR/code
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
sh $NNI_SYS_DIR/install_nni.sh
cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \
--nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \
--job_id_file ${jobIdFileName} \
Expand Down Expand Up @@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return result;
}

public killChildProcesses(pidFileName: string): string {
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
// prevent trialkeeper to be killed, so it can save exit code.
const command = `list_descendants ()
let command = `list_descendants ()
{
local children=$(ps -o pid= --ppid "$1")

Expand All @@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children"
}
kill $(list_descendants \`cat '${pidFileName}'\`)`
if (killSelf) {
command += `\nkill \`cat '${pidFileName}'\``
}
return command;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ class WindowsCommands extends OsCommands {
set MULTI_PHASE=${isMultiPhase}
set NNI_CODE_DIR=${codeDir}
${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""}

robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%
cd %NNI_SYS_DIR%
md %NNI_SYS_DIR%/code
robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%/code
cd %NNI_SYS_DIR%/code
python -c "import nni" 2>nul
if not %ERRORLEVEL% EQU 0 (
echo installing NNI as exit code of "import nni" is %ERRORLEVEL%
Expand Down Expand Up @@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return result;
}

public killChildProcesses(pidFileName: string): string {
const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
let command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force"}}` +
`kill-tree $ppid"`;
if (killSelf){
command += `;Stop-Process -Id $ppid`;
}
return command;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ abstract class OsCommands {
public abstract readLastLines(fileName: string, lineCount: number): string;
public abstract isProcessAliveCommand(pidFileName: string): string;
public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
public abstract killChildProcesses(pidFileName: string): string;
public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
public abstract extractFile(tarFileName: string, targetFolder: string): string;
public abstract executeScript(script: string, isFile: boolean): string;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
if (restServer.getErrorMessage !== undefined) {
throw new Error(restServer.getErrorMessage);
this.stopping = true;
throw new Error(restServer.getErrorMessage);
}
await delay(3000);
}
Expand Down Expand Up @@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (executor !== undefined) {
this.log.info(`killing gpu metric collector on ${executor.name}`);
const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
await executor.killChildProcesses(gpuJobPidPath);
await executor.killChildProcesses(gpuJobPidPath, true);
}
executorManager.releaseAllExecutor();
}
Expand Down Expand Up @@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this.timer.unsubscribe(disposable);
}
}
if (this.stopping){
this.timer.unsubscribe(disposable);
this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
}
collectingCount.pop();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,8 @@ class ShellExecutor {
return result !== undefined ? result : false;
}

public async killChildProcesses(pidFileName: string): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName);
public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
const commandResult = await this.execute(commandText);
return commandResult.exitCode == 0;
}
Expand Down
13 changes: 6 additions & 7 deletions src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import logging

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.optimizers import Adam

from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads
Expand Down Expand Up @@ -39,9 +38,9 @@ def __init__(self, model, loss, metrics, reward_function, optimizer, batch_size,

x, y = dataset_train
split = int(len(x) * 0.9)
self.train_set = Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = Dataset.from_tensor_slices(dataset_valid)
self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid)

self.mutator = EnasMutator(model)
self.mutator_optim = Adam(learning_rate=mutator_lr)
Expand Down Expand Up @@ -151,9 +150,9 @@ def validate_one_epoch(self, epoch):


def _create_train_loader(self):
train_set = self.train_set.shuffle(1000000).batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).batch(self.batch_size)
train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).repeat().batch(self.batch_size)
return iter(train_set), iter(test_set)

def _create_validate_loader(self):
return iter(self.test_set.shuffle(1000000).batch(self.batch_size))
return iter(self.test_set.shuffle(1000000).repeat().batch(self.batch_size))
24 changes: 1 addition & 23 deletions tools/nni_gpu_tool/gpu_metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,9 @@
from xml.dom import minidom


def check_ready_to_run():
    """Return True when no other GPU metrics collector process is found.

    On Windows the process table is queried through ``wmic``; elsewhere
    ``pgrep`` is used, restricted to the current user. The current process
    (and, on POSIX, its parent and the ``pgrep`` invocation itself) are
    ignored, so only *other* collector instances count.

    Returns:
        bool: True if no other collector was listed, False otherwise.

    Raises:
        subprocess.CalledProcessError: if the lookup command fails for a
            reason other than "no process matched".
    """
    own_pid = os.getpid()

    if sys.platform == 'win32':
        pgrep_output = subprocess.check_output(
            'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
        tokens = pgrep_output.decode("utf-8").strip().split()
        # Keep only numeric tokens: this drops the 'ProcessId' header and
        # tolerates empty output, where the old pop(0) raised IndexError.
        # Filtering (rather than list.remove) also avoids a ValueError when
        # the current PID is not part of the listing.
        other_pids = [int(tok) for tok in tokens if tok.isdigit() and int(tok) != own_pid]
        return not other_pids

    try:
        pgrep_output = subprocess.check_output(
            'pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
    except subprocess.CalledProcessError as err:
        # pgrep exits with status 1 when nothing matched, which simply
        # means no other collector is running.
        if err.returncode == 1:
            return True
        raise

    # Lines printed by `pgrep -a` start with "<pid> "; skip ourselves and
    # our parent, plus any line that belongs to the pgrep command itself.
    ignored_prefixes = ('%s ' % own_pid, '%s ' % os.getppid())
    others = []
    for raw_line in pgrep_output.splitlines():
        line = raw_line.decode()
        if "pgrep " in line or line.startswith(ignored_prefixes):
            continue
        others.append(line)
    return not others


def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
print("GPU metrics collector is already running. exiting...")
exit(2)

cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
Expand Down