Skip to content

Commit

Permalink
Add simple hardware switch functionality.
Browse files Browse the repository at this point in the history
  • Loading branch information
bethune-bryant committed Jul 29, 2024
1 parent 261faf7 commit ca650ba
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 10 deletions.
21 changes: 13 additions & 8 deletions gpustat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,15 @@
from blessed import Terminal

from gpustat import util
from gpustat import rocml as nvml
from gpustat import rocml as N
from gpustat.rocml import check_driver_nvml_version

if util.hasNvidia():
from gpustat import nvml
from gpustat.nvml import nvml as N
from gpustat.nvml import check_driver_nvml_version
else:
from gpustat import rocml as nvml
from gpustat import rocml as N
from gpustat.rocml import check_driver_nvml_version

NOT_SUPPORTED = 'Not Supported'
MB = 1024 * 1024
Expand Down Expand Up @@ -555,7 +561,6 @@ def _wrapped(*args, **kwargs):
processes = []
nv_comp_processes = nv_comp_processes or []
nv_graphics_processes = nv_graphics_processes or []
print(nv_comp_processes)
# A single process might run in both of graphics and compute mode,
# However we will display the process only once
seen_pids = set()
Expand Down Expand Up @@ -611,10 +616,10 @@ def _wrapped(*args, **kwargs):
gpu_stat = GPUStat(gpu_info)
except Exception as e:
gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
#except N.NVMLError_Unknown as e:
# gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
#except N.NVMLError_GpuIsLost as e:
# gpu_stat = InvalidGPU(index, "((GPU is lost))", e)
except N.NVMLError_Unknown as e:
gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
except N.NVMLError_GpuIsLost as e:
gpu_stat = InvalidGPU(index, "((GPU is lost))", e)

if isinstance(gpu_stat, InvalidGPU):
log.add_exception("GPU %d" % index, gpu_stat.exception)
Expand Down
13 changes: 11 additions & 2 deletions gpustat/rocml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Imports pynvml with sanity checks and custom patches."""
"""Imports pyrsmi and wraps it in a pynvml compatible interface."""

# pylint: disable=protected-access

Expand All @@ -16,10 +16,19 @@

NVML_TEMPERATURE_GPU = 1

class NVMLError_Unknown(Exception):
    """Exception carrying the pynvml-style name ``NVMLError_Unknown``.

    The text passed as ``message`` is stored on the instance as
    ``self.message`` and is also forwarded to ``Exception`` so that
    ``str(exc)`` and ``exc.args`` reflect it.
    """

    def __init__(self, message="An unknown ROCMLError has occurred"):
        # Initialise the base class first, then expose the same text
        # through the ``message`` attribute.
        super().__init__(message)
        self.message = message

class NVMLError_GpuIsLost(Exception):
    """Exception carrying the pynvml-style name ``NVMLError_GpuIsLost``.

    The text passed as ``message`` is stored on the instance as
    ``self.message`` and is also forwarded to ``Exception`` so that
    ``str(exc)`` and ``exc.args`` reflect it.
    """

    def __init__(self, message="ROCM Device is lost."):
        # Initialise the base class first, then expose the same text
        # through the ``message`` attribute.
        super().__init__(message)
        self.message = message

def nvmlDeviceGetCount():
    """Return the device count reported by ``rocml.smi_get_device_count()``.

    Named after the pynvml function of the same name; the value is passed
    through unchanged from the underlying ``rocml`` call.
    """
    device_count = rocml.smi_get_device_count()
    return device_count


def nvmlDeviceGetHandleByIndex(dev):
    """Return the handle for device ``dev``.

    In this wrapper the value passed in is itself used as the handle, so
    it is returned unchanged.
    """
    handle = dev  # no separate handle object: the argument doubles as the handle
    return handle

Expand Down
9 changes: 9 additions & 0 deletions gpustat/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import collections
import os.path
import subprocess
import sys
import traceback
from typing import Callable, Tuple, Type, TypeVar, Union
Expand Down Expand Up @@ -101,3 +102,11 @@ def report_summary(self, concise=True):
self._write("{msg} -> Total {value} occurrences.".format(
msg=msg, value=value))
self._write('')


def hasNvidia():
    """Report whether the ``nvidia-smi`` command runs successfully.

    The command is launched once with no arguments; its output is
    captured and thrown away.

    Returns:
        bool: ``True`` when ``nvidia-smi`` could be started and exited with
        status 0; ``False`` when it could not be started (e.g. not found or
        not executable) or exited with a non-zero status.
    """
    try:
        # Pass an argument list rather than a command string, and discard
        # stderr so that a failed probe does not print to the terminal.
        subprocess.check_output(['nvidia-smi'], stderr=subprocess.DEVNULL)
    except (OSError, subprocess.SubprocessError):
        # OSError: the executable is missing or cannot be launched.
        # SubprocessError: it ran but reported failure (CalledProcessError).
        return False
    return True

0 comments on commit ca650ba

Please sign in to comment.