From cc2d0f03d3979d165cff3babb38f244715e889ce Mon Sep 17 00:00:00 2001 From: brnelson Date: Wed, 31 Jul 2024 19:51:55 +0000 Subject: [PATCH] Switching to amdsmi --- gpustat/core.py | 21 +++++++++++++++----- gpustat/rocml.py | 51 +++++++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index eeedb2e..f7cdd35 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -213,6 +213,13 @@ def clk_freq(self) -> Optional[int]: v = self.entry['clk_freq'] return int(v) if v is not None else None + @property + def clk_freq_max(self) -> Optional[int]: + """ + """ + v = self.entry['clk_freq_max'] + return int(v) if v is not None else None + def print_to(self, fp, *, with_colors=True, # deprecated arg show_cmd=False, @@ -345,6 +352,8 @@ def __getattr__(self, name): # type: ignore _write(", ") _write(rjustify(safe_self.clk_freq, 3), color='CPowU') + _write(" / ") + _write(rjustify(safe_self.clk_freq_max, 3), color='CPowU') _write(" MHz") # Memory @@ -472,7 +481,7 @@ def _decode(b: Union[str, bytes]) -> str: assert isinstance(b, str) return b - def get_gpu_info(handle: NVMLHandle) -> NvidiaGPUInfo: + def get_gpu_info(handle: NVMLHandle, index: int = None) -> NvidiaGPUInfo: """Get one GPU information specified by nvml handle""" def safepcall(fn: Callable[[], Any], error_value: Any): @@ -529,7 +538,7 @@ def _wrapped(*args, **kwargs): return _wrapped gpu_info = NvidiaGPUInfo() - gpu_info['index'] = N.nvmlDeviceGetIndex(handle) + gpu_info['index'] = N.nvmlDeviceGetIndex(handle) if index is None else index gpu_info['name'] = _decode(N.nvmlDeviceGetName(handle)) gpu_info['uuid'] = _decode(N.nvmlDeviceGetUUID(handle)) @@ -557,14 +566,16 @@ def _wrapped(*args, **kwargs): # Power power = safenvml(N.nvmlDeviceGetPowerUsage)(handle) - gpu_info['power.draw'] = power // 1000 if power is not None else None + gpu_info['power.draw'] = power if power is not None else None power_limit = safenvml(N.nvmlDeviceGetEnforcedPowerLimit)(handle) - gpu_info['enforced.power.limit'] = power_limit // 1000 if power_limit is not None else None + gpu_info['enforced.power.limit'] = power_limit if power_limit is not None else None # Frequency freq = safenvml(N.nvmlDeviceGetClkFreq)(handle) gpu_info['clk_freq'] = freq if freq is not None else None + freq_max = safenvml(N.nvmlDeviceGetClkFreqMax)(handle) + gpu_info['clk_freq_max'] = freq_max if freq_max is not None else None # Processes nv_comp_processes = safenvml(N.nvmlDeviceGetComputeRunningProcesses)(handle) @@ -627,7 +638,7 @@ def _wrapped(*args, **kwargs): for index in gpus_to_query: try: handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index) - gpu_info = get_gpu_info(handle) + gpu_info = get_gpu_info(handle, index) gpu_stat = GPUStat(gpu_info) except N.NVMLError_Unknown as e: gpu_stat = InvalidGPU(index, "((Unknown Error))", e) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index fadc8f8..06c7033 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -12,7 +12,7 @@ from collections import namedtuple -from pyrsmi import rocml +from amdsmi import * NVML_TEMPERATURE_GPU = 1 @@ -40,25 +40,25 @@ def silent_run(to_call, *args, **kwargs): return retval def nvmlDeviceGetCount(): - return silent_run(rocml.smi_get_device_count) + return len(amdsmi_get_processor_handles()) def nvmlDeviceGetHandleByIndex(dev): - return dev + return amdsmi_get_processor_handles()[dev] def nvmlDeviceGetIndex(dev): - return dev + return -1 def nvmlDeviceGetName(dev): - return silent_run(rocml.smi_get_device_name, dev) + return amdsmi_get_gpu_board_info(dev)["product_name"] def nvmlDeviceGetUUID(dev): - return silent_run(rocml.smi_get_device_uuid, dev) + return amdsmi_get_gpu_device_uuid(dev) def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): - return silent_run(rocml.smi_get_device_temp, dev, loc) + return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT) def nvmlSystemGetDriverVersion(): - return silent_run(rocml.smi_get_kernel_version) + return "" def check_driver_nvml_version(driver_version_str: str): """Show warnings when an incompatible driver is used.""" @@ -76,18 +76,21 @@ def safeint(v) -> int: warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ") def nvmlDeviceGetFanSpeed(dev): - return silent_run(rocml.smi_get_device_fan_speed, dev) + try: + return amdsmi_get_gpu_fan_speed(dev, 0) + except Exception: + return None MemoryInfo = namedtuple('MemoryInfo', ['total', 'used']) def nvmlDeviceGetMemoryInfo(dev): - return MemoryInfo(total=silent_run(rocml.smi_get_device_memory_total, dev), - used=silent_run(rocml.smi_get_device_memory_used, dev)) + return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM), + used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM)) UtilizationRates = namedtuple('UtilizationRates', ['gpu']) def nvmlDeviceGetUtilizationRates(dev): - return UtilizationRates(gpu=silent_run(rocml.smi_get_device_utilization, dev)) + return UtilizationRates(gpu=amdsmi_get_gpu_activity(dev)["gfx_activity"]) def nvmlDeviceGetEncoderUtilization(dev): return None @@ -96,33 +99,41 @@ def nvmlDeviceGetDecoderUtilization(dev): return None def nvmlDeviceGetPowerUsage(dev): - return silent_run(rocml.smi_get_device_average_power, dev) + return amdsmi_get_power_info(dev)["current_socket_power"] def nvmlDeviceGetEnforcedPowerLimit(dev): - return None + return amdsmi_get_power_info(dev)["power_limit"] -ComputeProcess = namedtuple('ComputeProcess', ['pid']) +ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory']) def nvmlDeviceGetComputeRunningProcesses(dev): - processes = silent_run(rocml.smi_get_device_compute_process) - return [ComputeProcess(pid=i) for i in processes] + results = amdsmi_get_gpu_process_list(dev) + return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] def nvmlDeviceGetGraphicsRunningProcesses(dev): return None def nvmlDeviceGetClkFreq(dev): - return rocml.smi_get_device_freq(dev) + result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) + if "clk" in result: + return result["clk"] + else: + return result["cur_clk"] + +def nvmlDeviceGetClkFreqMax(dev): + result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) + return result["max_clk"] # Upon importing this module, let rocml be initialized and remain active # throughout the lifespan of the python process (until gpustat exists). _initialized: bool _init_error = None try: - rocml.smi_initialize() + amdsmi_init() _initialized = True def _shutdown(): - rocml.smi_shutdown() + amdsmi_shut_down() atexit.register(_shutdown) except Exception as exc: