Skip to content

Commit

Permalink
Switching to amdsmi
Browse files Browse the repository at this point in the history
  • Loading branch information
bethune-bryant committed Jul 31, 2024
1 parent 2c9aadf commit cc2d0f0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 25 deletions.
21 changes: 16 additions & 5 deletions gpustat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,13 @@ def clk_freq(self) -> Optional[int]:
v = self.entry['clk_freq']
return int(v) if v is not None else None

@property
def clk_freq_max(self) -> Optional[int]:
    """
    Returns the maximum clock frequency recorded for this GPU entry,
    i.e. the value stored under ``entry['clk_freq_max']`` converted to
    an integer, or None when that value is None.

    NOTE(review): the unit is presumably MHz, since the status line
    prints this value followed by " MHz" -- confirm against the backend.
    """
    v = self.entry['clk_freq_max']
    # Keep None as None so callers can tell "unavailable" apart from 0.
    return int(v) if v is not None else None

def print_to(self, fp, *,
with_colors=True, # deprecated arg
show_cmd=False,
Expand Down Expand Up @@ -345,6 +352,8 @@ def __getattr__(self, name): # type: ignore

_write(", ")
_write(rjustify(safe_self.clk_freq, 3), color='CPowU')
_write(" / ")
_write(rjustify(safe_self.clk_freq_max, 3), color='CPowU')
_write(" MHz")

# Memory
Expand Down Expand Up @@ -472,7 +481,7 @@ def _decode(b: Union[str, bytes]) -> str:
assert isinstance(b, str)
return b

def get_gpu_info(handle: NVMLHandle) -> NvidiaGPUInfo:
def get_gpu_info(handle: NVMLHandle, index: int = None) -> NvidiaGPUInfo:
"""Get one GPU information specified by nvml handle"""

def safepcall(fn: Callable[[], Any], error_value: Any):
Expand Down Expand Up @@ -529,7 +538,7 @@ def _wrapped(*args, **kwargs):
return _wrapped

gpu_info = NvidiaGPUInfo()
gpu_info['index'] = N.nvmlDeviceGetIndex(handle)
gpu_info['index'] = N.nvmlDeviceGetIndex(handle) if index is None else index

gpu_info['name'] = _decode(N.nvmlDeviceGetName(handle))
gpu_info['uuid'] = _decode(N.nvmlDeviceGetUUID(handle))
Expand Down Expand Up @@ -557,14 +566,16 @@ def _wrapped(*args, **kwargs):

# Power
power = safenvml(N.nvmlDeviceGetPowerUsage)(handle)
gpu_info['power.draw'] = power // 1000 if power is not None else None
gpu_info['power.draw'] = power if power is not None else None

power_limit = safenvml(N.nvmlDeviceGetEnforcedPowerLimit)(handle)
gpu_info['enforced.power.limit'] = power_limit // 1000 if power_limit is not None else None
gpu_info['enforced.power.limit'] = power_limit if power_limit is not None else None

# Frequency
freq = safenvml(N.nvmlDeviceGetClkFreq)(handle)
gpu_info['clk_freq'] = freq if freq is not None else None
freq_max = safenvml(N.nvmlDeviceGetClkFreqMax)(handle)
gpu_info['clk_freq_max'] = freq_max if freq_max is not None else None

# Processes
nv_comp_processes = safenvml(N.nvmlDeviceGetComputeRunningProcesses)(handle)
Expand Down Expand Up @@ -627,7 +638,7 @@ def _wrapped(*args, **kwargs):
for index in gpus_to_query:
try:
handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index)
gpu_info = get_gpu_info(handle)
gpu_info = get_gpu_info(handle, index)
gpu_stat = GPUStat(gpu_info)
except N.NVMLError_Unknown as e:
gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
Expand Down
51 changes: 31 additions & 20 deletions gpustat/rocml.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from collections import namedtuple


from pyrsmi import rocml
from amdsmi import *

NVML_TEMPERATURE_GPU = 1

Expand Down Expand Up @@ -40,25 +40,25 @@ def silent_run(to_call, *args, **kwargs):
return retval

def nvmlDeviceGetCount():
return silent_run(rocml.smi_get_device_count)
return len(amdsmi_get_processor_handles())

def nvmlDeviceGetHandleByIndex(dev):
return dev
return amdsmi_get_processor_handles()[dev]

def nvmlDeviceGetIndex(dev):
return dev
return -1

def nvmlDeviceGetName(dev):
return silent_run(rocml.smi_get_device_name, dev)
return amdsmi_get_gpu_board_info(dev)["product_name"]

def nvmlDeviceGetUUID(dev):
return silent_run(rocml.smi_get_device_uuid, dev)
return amdsmi_get_gpu_device_uuid(dev)

def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU):
return silent_run(rocml.smi_get_device_temp, dev, loc)
return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT)

def nvmlSystemGetDriverVersion():
return silent_run(rocml.smi_get_kernel_version)
return ""

def check_driver_nvml_version(driver_version_str: str):
"""Show warnings when an incompatible driver is used."""
Expand All @@ -76,18 +76,21 @@ def safeint(v) -> int:
warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ")

def nvmlDeviceGetFanSpeed(dev):
return silent_run(rocml.smi_get_device_fan_speed, dev)
try:
return amdsmi_get_gpu_fan_speed(dev, 0)
except Exception:
return None

MemoryInfo = namedtuple('MemoryInfo', ['total', 'used'])

def nvmlDeviceGetMemoryInfo(dev):
return MemoryInfo(total=silent_run(rocml.smi_get_device_memory_total, dev),
used=silent_run(rocml.smi_get_device_memory_used, dev))
return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM),
used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM))

UtilizationRates = namedtuple('UtilizationRates', ['gpu'])

def nvmlDeviceGetUtilizationRates(dev):
return UtilizationRates(gpu=silent_run(rocml.smi_get_device_utilization, dev))
return UtilizationRates(gpu=amdsmi_get_gpu_activity(dev)["gfx_activity"])

def nvmlDeviceGetEncoderUtilization(dev):
    """Stub for the NVML-style encoder utilization query.

    Always returns None, whatever ``dev`` is; no encoder utilization is
    looked up here.
    """
    return None
Expand All @@ -96,33 +99,41 @@ def nvmlDeviceGetDecoderUtilization(dev):
return None

def nvmlDeviceGetPowerUsage(dev):
return silent_run(rocml.smi_get_device_average_power, dev)
return amdsmi_get_power_info(dev)["current_socket_power"]

def nvmlDeviceGetEnforcedPowerLimit(dev):
return None
return amdsmi_get_power_info(dev)["power_limit"]

ComputeProcess = namedtuple('ComputeProcess', ['pid'])
ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory'])

def nvmlDeviceGetComputeRunningProcesses(dev):
processes = silent_run(rocml.smi_get_device_compute_process)
return [ComputeProcess(pid=i) for i in processes]
results = amdsmi_get_gpu_process_list(dev)
return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results]

def nvmlDeviceGetGraphicsRunningProcesses(dev):
    """Stub for the NVML-style graphics process query.

    Always returns None, whatever ``dev`` is; no graphics process list is
    looked up here.
    """
    return None

def nvmlDeviceGetClkFreq(dev):
return rocml.smi_get_device_freq(dev)
result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS)
if "clk" in result:
return result["clk"]
else:
return result["cur_clk"]

def nvmlDeviceGetClkFreqMax(dev):
    """Return the maximum system clock reported for a device.

    Queries ``amdsmi_get_clock_info`` for the ``AmdSmiClkType.SYS`` clock
    of ``dev`` and returns the ``"max_clk"`` field of the result.

    Any exception raised by the amdsmi call, or a missing ``"max_clk"``
    key, propagates to the caller.
    """
    result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS)
    return result["max_clk"]

# Upon importing this module, let rocml be initialized and remain active
# throughout the lifespan of the Python process (until gpustat exits).
_initialized: bool
_init_error = None
try:
rocml.smi_initialize()
amdsmi_init()
_initialized = True

def _shutdown():
rocml.smi_shutdown()
amdsmi_shut_down()
atexit.register(_shutdown)

except Exception as exc:
Expand Down

0 comments on commit cc2d0f0

Please sign in to comment.