From 619177b9965ee975b9f38a1f1bdfcceda49efcef Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Mon, 13 Sep 2021 10:35:05 +0800
Subject: [PATCH] [Retiarii] Remove unused code and enrich integration tests
 (#4097)

---
 nni/retiarii/evaluator/pytorch/__init__.py |   1 -
 nni/retiarii/evaluator/pytorch/base.py     | 305 ---------------------
 nni/retiarii/experiment/__init__.py        |   0
 nni/retiarii/experiment/pytorch.py         |  25 +-
 test/scripts/nas.sh                        |  14 +-
 5 files changed, 15 insertions(+), 330 deletions(-)
 delete mode 100644 nni/retiarii/evaluator/pytorch/base.py
 create mode 100644 nni/retiarii/experiment/__init__.py

diff --git a/nni/retiarii/evaluator/pytorch/__init__.py b/nni/retiarii/evaluator/pytorch/__init__.py
index c35431e91a..76da136a76 100644
--- a/nni/retiarii/evaluator/pytorch/__init__.py
+++ b/nni/retiarii/evaluator/pytorch/__init__.py
@@ -1,2 +1 @@
-from .base import PyTorchImageClassificationTrainer, PyTorchMultiModelTrainer
 from .lightning import *
diff --git a/nni/retiarii/evaluator/pytorch/base.py b/nni/retiarii/evaluator/pytorch/base.py
deleted file mode 100644
index 62d25b4f74..0000000000
--- a/nni/retiarii/evaluator/pytorch/base.py
+++ /dev/null
@@ -1,305 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-# This file is deprecated.
-
-import abc
-from typing import Any, List, Dict, Tuple
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.utils.data import DataLoader
-from torchvision import datasets, transforms
-
-import nni
-
-
-class BaseTrainer(abc.ABC):
-    @abc.abstractmethod
-    def fit(self) -> None:
-        pass
-
-
-def get_default_transform(dataset: str) -> Any:
-    """
-    Get the default image transform for a specific dataset.
-    This is needed because transform objects cannot be directly passed as arguments.
-
-    Parameters
-    ----------
-    dataset : str
-        Dataset class name.
-
-    Returns
-    -------
-    transform object
-    """
-    if dataset == 'MNIST':
-        return transforms.Compose([
-            transforms.ToTensor(),
-            transforms.Normalize((0.1307,), (0.3081,))
-        ])
-    if dataset == 'CIFAR10':
-        return transforms.Compose([
-            transforms.RandomCrop(32, padding=4),
-            transforms.RandomHorizontalFlip(),
-            transforms.ToTensor(),
-            transforms.Normalize((0.4914, 0.4822, 0.4465),
-                                 (0.2023, 0.1994, 0.2010)),
-        ])
-    # unsupported dataset, return None
-    return None
-
-
-class PyTorchImageClassificationTrainer(BaseTrainer):
-    """
-    Image classification trainer for PyTorch.
-
-    A model, along with its corresponding dataset and optimizer config, is used to initialize the trainer.
-    The trainer runs for a fixed number of epochs (by default 10) and reports the final result.
-
-    TODO: support scheduler, validating every n epochs, and train/valid dataset split.
-
-    Limitation induced by NNI: kwargs must be serializable to put into a JSON packed in parameters.
-    """
-
-    def __init__(self, model,
-                 dataset_cls='MNIST', dataset_kwargs=None, dataloader_kwargs=None,
-                 optimizer_cls='SGD', optimizer_kwargs=None, trainer_kwargs=None):
-        """Initialization of image classification trainer.
-
-        Parameters
-        ----------
-        model : nn.Module
-            Model to train.
-        dataset_cls : str, optional
-            Dataset class name that is available in ``torchvision.datasets``, by default 'MNIST'
-        dataset_kwargs : dict, optional
-            Keyword arguments passed to initialization of dataset class, by default None
-        dataloader_kwargs : dict, optional
-            Keyword arguments passed to ``torch.utils.data.DataLoader``, by default None
-        optimizer_cls : str, optional
-            Optimizer class name that is available in ``torch.optim``, by default 'SGD'
-        optimizer_kwargs : dict, optional
-            Keyword arguments passed to initialization of optimizer class, by default None
-        trainer_kwargs : dict, optional
-            Keyword arguments passed to the trainer. Will be passed to a ``Trainer`` class in the future.
-            Currently, only the key ``max_epochs`` is useful.
-        """
-        super().__init__()
-        self._use_cuda = torch.cuda.is_available()
-        self.model = model
-        if self._use_cuda:
-            self.model.cuda()
-        self._loss_fn = nn.CrossEntropyLoss()
-        self._train_dataset = getattr(datasets, dataset_cls)(train=True, transform=get_default_transform(dataset_cls),
-                                                             **(dataset_kwargs or {}))
-        self._val_dataset = getattr(datasets, dataset_cls)(train=False, transform=get_default_transform(dataset_cls),
-                                                           **(dataset_kwargs or {}))
-        self._optimizer = getattr(torch.optim, optimizer_cls)(model.parameters(), **(optimizer_kwargs or {}))
-        self._trainer_kwargs = trainer_kwargs or {'max_epochs': 10}
-
-        self._train_dataloader = DataLoader(self._train_dataset, **(dataloader_kwargs or {}))
-        self._val_dataloader = DataLoader(self._val_dataset, **(dataloader_kwargs or {}))
-
-    def _accuracy(self, input, target):  # pylint: disable=redefined-builtin
-        _, predict = torch.max(input.data, 1)
-        correct = predict.eq(target.data).cpu().sum().item()
-        return correct / input.size(0)
-
-    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> Dict[str, Any]:
-        x, y = self.training_step_before_model(batch, batch_idx)
-        y_hat = self.model(x)
-        return self.training_step_after_model(x, y, y_hat)
-
-    def training_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int):
-        x, y = batch
-        if self._use_cuda:
-            x, y = x.cuda(torch.device('cuda:0')), y.cuda(torch.device('cuda:0'))
-        return x, y
-
-    def training_step_after_model(self, x, y, y_hat):
-        loss = self._loss_fn(y_hat, y)
-        return loss
-
-    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> Dict[str, Any]:
-        x, y = self.validation_step_before_model(batch, batch_idx)
-        y_hat = self.model(x)
-        return self.validation_step_after_model(x, y, y_hat)
-
-    def validation_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int):
-        x, y = batch
-        if self._use_cuda:
-            x, y = x.cuda(), y.cuda()
-        return x, y
-
-    def validation_step_after_model(self, x, y, y_hat):
-        acc = self._accuracy(y_hat, y)
-        return {'val_acc': acc}
-
-    def validation_epoch_end(self, outputs: List[Dict[str, Any]]) -> Dict[str, Any]:
-        # We might need dict metrics in future?
-        avg_acc = np.mean([x['val_acc'] for x in outputs]).item()
-        nni.report_intermediate_result(avg_acc)
-        return {'val_acc': avg_acc}
-
-    def _validate(self):
-        validation_outputs = []
-        for i, batch in enumerate(self._val_dataloader):
-            validation_outputs.append(self.validation_step(batch, i))
-        return self.validation_epoch_end(validation_outputs)
-
-    def _train(self):
-        for i, batch in enumerate(self._train_dataloader):
-            self._optimizer.zero_grad()
-            loss = self.training_step(batch, i)
-            loss.backward()
-            self._optimizer.step()
-
-    def fit(self) -> None:
-        for _ in range(self._trainer_kwargs['max_epochs']):
-            self._train()
-            self._validate()
-        # assuming val_acc here
-        nni.report_final_result(self._validate()['val_acc'])
-
-
-class PyTorchMultiModelTrainer(BaseTrainer):
-    def __init__(self, multi_model, kwargs=None):
-        self.multi_model = multi_model
-        self.kwargs = kwargs or {}
-        self._train_dataloaders = []
-        self._train_datasets = []
-        self._val_dataloaders = []
-        self._val_datasets = []
-        self._optimizers = []
-        self._trainers = []
-        self._loss_fn = nn.CrossEntropyLoss()
-        self.max_steps = self.kwargs['max_steps'] if 'max_steps' in self.kwargs else None
-        self.n_model = len(self.kwargs['model_kwargs'])
-
-        for m in self.kwargs['model_kwargs']:
-            if m['use_input']:
-                dataset_cls = m['dataset_cls']
-                dataset_kwargs = m['dataset_kwargs']
-                dataloader_kwargs = m['dataloader_kwargs']
-                train_dataset = getattr(datasets, dataset_cls)(train=True, transform=get_default_transform(dataset_cls),
-                                                               **(dataset_kwargs or {}))
-                val_dataset = getattr(datasets, dataset_cls)(train=False, transform=get_default_transform(dataset_cls),
-                                                             **(dataset_kwargs or {}))
-                train_dataloader = DataLoader(train_dataset, **(dataloader_kwargs or {}))
-                val_dataloader = DataLoader(val_dataset, **(dataloader_kwargs or {}))
-                self._train_datasets.append(train_dataset)
-                self._train_dataloaders.append(train_dataloader)
-
-                self._val_datasets.append(val_dataset)
-                self._val_dataloaders.append(val_dataloader)
-
-            if m['use_output']:
-                optimizer_cls = m['optimizer_cls']
-                optimizer_kwargs = m['optimizer_kwargs']
-                m_header = f"M_{m['model_id']}"
-                one_model_params = []
-                for name, param in multi_model.named_parameters():
-                    name_prefix = '_'.join(name.split('_')[:2])
-                    if m_header == name_prefix:
-                        one_model_params.append(param)
-
-                optimizer = getattr(torch.optim, optimizer_cls)(one_model_params, **(optimizer_kwargs or {}))
-                self._optimizers.append(optimizer)
-
-    def fit(self) -> None:
-        torch.autograd.set_detect_anomaly(True)
-        max_epochs = max([x['trainer_kwargs']['max_epochs'] for x in self.kwargs['model_kwargs']])
-        for _ in range(max_epochs):
-            self._train()
-            self._validate()
-        nni.report_final_result(self._validate())
-
-    def _train(self):
-        for batch_idx, multi_model_batch in enumerate(zip(*self._train_dataloaders)):
-            for opt in self._optimizers:
-                opt.zero_grad()
-            xs = []
-            ys = []
-            for idx, batch in enumerate(multi_model_batch):
-                x, y = self.training_step_before_model(batch, batch_idx, f'cuda:{idx}')
-                xs.append(x)
-                ys.append(y)
-
-            y_hats = self.multi_model(*xs)
-            if len(ys) != len(xs):
-                raise ValueError('len(ys) should be equal to len(xs)')
-            losses = []
-            report_loss = {}
-            for output_idx, yhat in enumerate(y_hats):
-                if len(ys) == len(y_hats):
-                    loss = self.training_step_after_model(xs[output_idx], ys[output_idx], yhat)
-                elif len(ys) == 1:
-                    loss = self.training_step_after_model(xs[0], ys[0].to(yhat.get_device()), yhat)
-                else:
-                    raise ValueError('len(ys) should be either 1 or len(y_hats)')
-                losses.append(loss.to("cuda:0"))
-                report_loss[self.kwargs['model_kwargs'][output_idx]['model_id']] = loss.item()
-            summed_loss = sum(losses)
-            summed_loss.backward()
-            for opt in self._optimizers:
-                opt.step()
-            if self.max_steps and batch_idx >= self.max_steps:
-                return
-
-    def training_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int, device=None):
-        x, y = batch
-        if device:
-            x, y = x.cuda(torch.device(device)), y.cuda(torch.device(device))
-        return x, y
-
-    def training_step_after_model(self, x, y, y_hat):
-        loss = self._loss_fn(y_hat, y)
-        return loss
-
-    def _validate(self):
-        all_val_outputs = {idx: [] for idx in range(self.n_model)}
-        for batch_idx, multi_model_batch in enumerate(zip(*self._val_dataloaders)):
-            xs = []
-            ys = []
-            for idx, batch in enumerate(multi_model_batch):
-                x, y = self.training_step_before_model(batch, batch_idx, f'cuda:{idx}')
-                xs.append(x)
-                ys.append(y)
-            if len(ys) != len(xs):
-                raise ValueError('len(ys) should be equal to len(xs)')
-
-            y_hats = self.multi_model(*xs)
-
-            for output_idx, yhat in enumerate(y_hats):
-                if len(ys) == len(y_hats):
-                    acc = self.validation_step_after_model(xs[output_idx], ys[output_idx], yhat)
-                elif len(ys) == 1:
-                    acc = self.validation_step_after_model(xs[0], ys[0].to(yhat.get_device()), yhat)
-                else:
-                    raise ValueError('len(ys) should be either 1 or len(y_hats)')
-                all_val_outputs[output_idx].append(acc)
-
-        report_acc = {}
-        for idx in all_val_outputs:
-            avg_acc = np.mean([x['val_acc'] for x in all_val_outputs[idx]]).item()
-            report_acc[self.kwargs['model_kwargs'][idx]['model_id']] = avg_acc
-        nni.report_intermediate_result(report_acc)
-        return report_acc
-
-    def validation_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int, device=None):
-        x, y = batch
-        if device:
-            x, y = x.cuda(torch.device(device)), y.cuda(torch.device(device))
-        return x, y
-
-    def validation_step_after_model(self, x, y, y_hat):
-        acc = self._accuracy(y_hat, y)
-        return {'val_acc': acc}
-
-    def _accuracy(self, input, target):  # pylint: disable=redefined-builtin
-        _, predict = torch.max(input.data, 1)
-        correct = predict.eq(target.data).cpu().sum().item()
-        return correct / input.size(0)
diff --git a/nni/retiarii/experiment/__init__.py b/nni/retiarii/experiment/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/nni/retiarii/experiment/pytorch.py b/nni/retiarii/experiment/pytorch.py
index b8db5f989c..ee17b1f7ba 100644
--- a/nni/retiarii/experiment/pytorch.py
+++ b/nni/retiarii/experiment/pytorch.py
@@ -3,41 +3,38 @@
 
 import atexit
 import logging
+import os
+import socket
 import time
 from dataclasses import dataclass
-import os
 from pathlib import Path
-import socket
 from subprocess import Popen
 from threading import Thread
-import time
 from typing import Any, List, Optional, Union
 
 import colorama
 import psutil
-
 import torch
 import torch.nn as nn
 
 import nni.runtime.log
-from nni.experiment import Experiment, TrainingServiceConfig
-from nni.experiment import management, launcher, rest
+from nni.common.device import GPUDevice
+from nni.experiment import Experiment, TrainingServiceConfig, launcher, management, rest
 from nni.experiment.config import util
 from nni.experiment.config.base import ConfigBase, PathLike
 from nni.experiment.pipe import Pipe
 from nni.tools.nnictl.command_utils import kill_command
-from nni.common.device import GPUDevice
 
 from ..codegen import model_to_pytorch_script
 from ..converter import convert_to_graph
 from ..converter.graph_gen import GraphConverterWithShape
 from ..execution import list_models, set_execution_engine
 from ..execution.python import get_mutation_dict
-from ..graph import Model, Evaluator
+from ..graph import Evaluator
 from ..integration import RetiariiAdvisor
 from ..mutator import Mutator
-from ..nn.pytorch.mutator import process_inline_mutation, extract_mutation_from_pt_module
-from ..strategy import BaseStrategy
+from ..nn.pytorch.mutator import extract_mutation_from_pt_module, process_inline_mutation
 from ..oneshot.interface import BaseOneShotTrainer
+from ..strategy import BaseStrategy
 
 _logger = logging.getLogger(__name__)
 
@@ -73,7 +70,7 @@ def __init__(self, training_service_platform: Optional[str] = None, **kwargs):
         super().__init__(**kwargs)
         if training_service_platform is not None:
             assert 'training_service' not in kwargs
-            self.training_service = util.training_service_config_factory(platform = training_service_platform)
+            self.training_service = util.training_service_config_factory(platform=training_service_platform)
         self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry py'
 
     def __setattr__(self, key, value):
@@ -117,6 +114,7 @@ def _validation_rules(self):
         'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class')
     }
 
+
 def preprocess_model(base_model, trainer, applied_mutators, full_ir=True, dummy_input=None):
     # TODO: this logic might need to be refactored into execution engine
     if full_ir:
@@ -220,6 +218,7 @@ def start(self, port: int = 8080, debug: bool = False) -> None:
             engine = BaseExecutionEngine()
         elif self.config.execution_engine == 'cgo':
             from ..execution.cgo_engine import CGOExecutionEngine
+
             # assert self.config.trial_gpu_number==1, "trial_gpu_number must be 1 to use CGOExecutionEngine"
             assert self.config.batch_waiting_time is not None
             devices = self._construct_devices()
@@ -273,14 +272,14 @@ def start(self, port: int = 8080, debug: bool = False) -> None:
     def _construct_devices(self):
         devices = []
         if hasattr(self.config.training_service, 'machine_list'):
-            for machine_idx, machine in enumerate(self.config.training_service.machine_list):
+            for machine in self.config.training_service.machine_list:
                 for gpu_idx in machine.gpu_indices:
                     devices.append(GPUDevice(machine.host, gpu_idx))
         else:
             for gpu_idx in self.config.training_service.gpu_indices:
                 devices.append(GPUDevice('local', gpu_idx))
         return devices
-    
+
     def _create_dispatcher(self):
         return self._dispatcher
diff --git a/test/scripts/nas.sh b/test/scripts/nas.sh
index 5267fcfe98..b941acd118 100644
--- a/test/scripts/nas.sh
+++ b/test/scripts/nas.sh
@@ -6,17 +6,9 @@ echo ""
 echo "===========================Testing: NAS==========================="
 
 EXAMPLE_DIR=${CWD}/../examples/nas
-echo "testing nnictl ss_gen (classic nas)..."
-cd $EXAMPLE_DIR/legacy/classic_nas
-SEARCH_SPACE_JSON=nni_auto_gen_search_space.json
-if [ -f $SEARCH_SPACE_JSON ]; then
-    rm $SEARCH_SPACE_JSON
-fi
-nnictl ss_gen -t "python3 mnist.py"
-if [ ! -f $SEARCH_SPACE_JSON ]; then
-    echo "Search space file not found!"
-    exit 1
-fi
+echo "testing mnist..."
+cd $EXAMPLE_DIR/multi-trial/mnist
+python3 search.py
 
 echo "testing darts..."
 cd $EXAMPLE_DIR/oneshot/darts
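
Note: with PyTorchImageClassificationTrainer and PyTorchMultiModelTrainer removed, the new "testing mnist..." step in nas.sh exercises the lightning-based evaluators that remain exported from nni/retiarii/evaluator/pytorch/__init__.py. The sketch below shows roughly what such a multi-trial search.py entry point looks like against the Retiarii API of this era; it is an illustration, not the actual contents of examples/nas/multi-trial/mnist/search.py, and the model space Net, the batch sizes, and the strategy choice are assumptions.

# Hypothetical sketch of a multi-trial search script, assuming the
# lightning-based evaluator API (nni.retiarii.evaluator.pytorch.lightning)
# that this patch keeps, circa NNI 2.4/2.5.
import torch.nn.functional as F
import nni.retiarii.nn.pytorch as nn
import nni.retiarii.evaluator.pytorch.lightning as pl
import nni.retiarii.strategy as strategy
from nni.retiarii import model_wrapper
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment
from torchvision import transforms
from torchvision.datasets import MNIST


@model_wrapper  # makes the model space serializable for multi-trial execution
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        # LayerChoice declares the search space: two shape-preserving candidate ops.
        self.conv2 = nn.LayerChoice([nn.Conv2d(32, 64, 3, padding=1),
                                     nn.Conv2d(32, 64, 5, padding=2)])
        self.fc = nn.Linear(64 * 14 * 14, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        return self.fc(x.flatten(1))


if __name__ == '__main__':
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,))])
    train_dataset = MNIST('data/mnist', train=True, download=True, transform=transform)
    test_dataset = MNIST('data/mnist', train=False, download=True, transform=transform)

    # pl.Classification bundles the train/validate/report loop that the deleted
    # PyTorchImageClassificationTrainer used to hand-roll.
    evaluator = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                  val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
                                  max_epochs=2)

    exp = RetiariiExperiment(Net(), evaluator, [], strategy.Random())
    exp_config = RetiariiExeConfig('local')
    exp_config.trial_concurrency = 1
    exp_config.max_trial_number = 4
    exp.run(exp_config, 8081)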