This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

[Retiarii] Retry a failed multi-model trial by disabling CGO in CGOExecutionEngine #4098

Merged
merged 20 commits on Oct 11, 2021
Changes from 19 commits
44 changes: 37 additions & 7 deletions nni/common/device.py
@@ -2,39 +2,69 @@
# Licensed under the MIT license.

from dataclasses import dataclass
from abc import ABC, abstractmethod

try:
from typing import Literal
except ImportError:
from typing_extensions import Literal


@dataclass
class GPUDevice:
class Device(ABC):
node_id: str
gpu_id: int
status: Literal['idle', 'busy', 'unknown'] = 'idle'

def __eq__(self, o) -> bool:
if type(self) == type(o):
return self.node_id == o.node_id
else:
return False

def __lt__(self, o) -> bool:
return self.node_id < o.node_id

def set_status(self, status):
self.status = status

def __repr__(self) -> str:
return "{Abstract Device %s, Status %s}" % (self.node_id, self.status)

@abstractmethod
def device_repr(self) -> str:
pass


@dataclass
class GPUDevice(Device):
gpu_id: int = -1

def __init__(self, node_id, gpu_id, status='idle'):
self.node_id = node_id
self.gpu_id = gpu_id
self.status = status

def __eq__(self, o: Device) -> bool:
if isinstance(o, GPUDevice):
return self.node_id == o.node_id and self.gpu_id == o.gpu_id
return False

def __lt__(self, o) -> bool:
def __lt__(self, o: Device) -> bool:
if self.node_id < o.node_id:
return True
elif self.node_id > o.node_id:
return False
else:
return self.gpu_id < o.gpu_id
if isinstance(o, GPUDevice):
return self.gpu_id < o.gpu_id
else:
return True

def __repr__(self) -> str:
return "{Environment %s, GPU %d, Status %s}" % (self.node_id, self.gpu_id, self.status)

def __hash__(self) -> int:
return hash(self.node_id + '_' + str(self.gpu_id))

def set_status(self, status):
self.status = status

def device_repr(self):
return f"cuda:{self.gpu_id}"
37 changes: 34 additions & 3 deletions nni/retiarii/codegen/pytorch.py
@@ -2,7 +2,10 @@
# Licensed under the MIT license.

import logging
from typing import List, Tuple, Any
from typing import Dict, List, Tuple, Any

from nni.retiarii.operation_def.torch_op_def import ToDevice
from nni.common.device import Device, GPUDevice

from ..graph import IllegalGraphError, Edge, Graph, Node, Model

@@ -70,7 +73,7 @@ def _format_inputs(node: Node) -> Tuple[List[str], List[Any]]:
# when the input comes from a single-output operator
inputs.append('{}'.format(edge.head.name))
if edge.head.operation.type in ('prim::Constant', 'prim::GetAttr') and \
'value' in edge.head.operation.parameters:
'value' in edge.head.operation.parameters:
inputs_value.append(edge.head.operation.parameters['value'])
else:
inputs_value.append(None)
@@ -98,15 +101,39 @@ def _remove_prefix(names, graph_name):
return names[len(graph_name):] if names.startswith(graph_name) else names


def generate_cuda_mapping(placement: Dict[Node, Device]) -> Dict[Device, int]:
'''
Since CUDA_VISIBLE_DEVICES will be set to the list of real GPU IDs,
we need to remap the GPU IDs when generating code so that they match.
For example, when CUDA_VISIBLE_DEVICES="0,3", the generated code must use "cuda:0" and "cuda:1".
'''
unique_devices = sorted(list(set([e for e in placement.values() if isinstance(e, GPUDevice)])))
node_gpu_cnt = {}
cuda_remapped_id = {}
for d in unique_devices:
if d.node_id not in node_gpu_cnt:
node_gpu_cnt[d.node_id] = 0
node_gpu_cnt[d.node_id] += 1
cuda_remapped_id[d] = node_gpu_cnt[d.node_id] - 1

return cuda_remapped_id


def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str:
nodes = graph.topo_sort()

# handle module node and function node differently
# only need to generate code for module here
import_pkgs = set()
node_codes = []
cuda_remapped_id = None
if placement:
cuda_remapped_id = generate_cuda_mapping(placement)
for node in nodes:
if node.operation:
if placement and isinstance(node.operation, ToDevice):
node.operation.override_device_repr("cuda:%d" % cuda_remapped_id[node.operation.device])

if node.operation.type == 'shared':
continue
pkg_name = node.operation.get_import_pkg()
@@ -115,7 +142,11 @@ def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str
node_code = node.operation.to_init_code(_remove_prefix(node.name, graph_name))
if node_code is not None:
if placement and node in placement and len(node_code) > 0:
node_codes.append(f"{node_code}.to('{placement[node].device_repr()}')")
if isinstance(placement[node], GPUDevice):
device_repr = "cuda:%d" % cuda_remapped_id[placement[node]]
else:
device_repr = placement[node].device_repr()
node_codes.append(f"{node_code}.to('{device_repr}')")
else:
node_codes.append(node_code)

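
The remapping that generate_cuda_mapping performs (see its docstring above) can be illustrated with a small sketch. The placement below is hypothetical; in the engine the keys are graph Node objects and the devices come from the resource scheduler.

# Two operators of one model placed on physical GPUs 1 and 3 of the same worker.
# Once CUDA_VISIBLE_DEVICES="1,3" is set for the trial, PyTorch sees those GPUs
# as cuda:0 and cuda:1, which is exactly what the remapping produces.
placement = {
    'op_conv': GPUDevice(node_id='worker0', gpu_id=1),
    'op_fc': GPUDevice(node_id='worker0', gpu_id=3),
}
mapping = generate_cuda_mapping(placement)
# mapping == {GPUDevice('worker0', 1): 0, GPUDevice('worker0', 3): 1}
# graph_to_pytorch_model then appends ".to('cuda:0')" / ".to('cuda:1')" to the
# corresponding nodes' init code instead of using the physical GPU ids.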