From cc2d0f03d3979d165cff3babb38f244715e889ce Mon Sep 17 00:00:00 2001
From: brnelson <bryant.nelson@amd.com>
Date: Wed, 31 Jul 2024 19:51:55 +0000
Subject: [PATCH] Switching to amdsmi

---
 gpustat/core.py  | 21 +++++++++++++++-----
 gpustat/rocml.py | 51 +++++++++++++++++++++++++++++-------------------
 2 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/gpustat/core.py b/gpustat/core.py
index eeedb2e..f7cdd35 100644
--- a/gpustat/core.py
+++ b/gpustat/core.py
@@ -213,6 +213,13 @@ def clk_freq(self) -> Optional[int]:
         v = self.entry['clk_freq']
         return int(v) if v is not None else None
 
+    @property
+    def clk_freq_max(self) -> Optional[int]:
+        """Return the 'clk_freq_max' entry as an int,
+        or None when the entry holds None."""
+        v = self.entry['clk_freq_max']
+        return int(v) if v is not None else None
+
     def print_to(self, fp, *,
                  with_colors=True,    # deprecated arg
                  show_cmd=False,
@@ -345,6 +352,8 @@ def __getattr__(self, name):  # type: ignore
                 
         _write(",  ")
         _write(rjustify(safe_self.clk_freq, 3), color='CPowU')
+        _write(" / ")
+        _write(rjustify(safe_self.clk_freq_max, 3), color='CPowU')
         _write(" MHz")
 
         # Memory
@@ -472,7 +481,7 @@ def _decode(b: Union[str, bytes]) -> str:
             assert isinstance(b, str)
             return b
 
-        def get_gpu_info(handle: NVMLHandle) -> NvidiaGPUInfo:
+        def get_gpu_info(handle: NVMLHandle, index: Optional[int] = None) -> NvidiaGPUInfo:
             """Get one GPU information specified by nvml handle"""
 
             def safepcall(fn: Callable[[], Any], error_value: Any):
@@ -529,7 +538,7 @@ def _wrapped(*args, **kwargs):
                 return _wrapped
 
             gpu_info = NvidiaGPUInfo()
-            gpu_info['index'] = N.nvmlDeviceGetIndex(handle)
+            gpu_info['index'] = N.nvmlDeviceGetIndex(handle) if index is None else index
 
             gpu_info['name'] = _decode(N.nvmlDeviceGetName(handle))
             gpu_info['uuid'] = _decode(N.nvmlDeviceGetUUID(handle))
@@ -557,14 +566,16 @@ def _wrapped(*args, **kwargs):
 
             # Power
             power = safenvml(N.nvmlDeviceGetPowerUsage)(handle)
-            gpu_info['power.draw'] = power // 1000 if power is not None else None
+            gpu_info['power.draw'] = power if power is not None else None
 
             power_limit = safenvml(N.nvmlDeviceGetEnforcedPowerLimit)(handle)
-            gpu_info['enforced.power.limit'] = power_limit // 1000 if power_limit is not None else None
+            gpu_info['enforced.power.limit'] = power_limit if power_limit is not None else None
 
             # Frequency
             freq = safenvml(N.nvmlDeviceGetClkFreq)(handle)
             gpu_info['clk_freq'] = freq if freq is not None else None
+            freq_max = safenvml(N.nvmlDeviceGetClkFreqMax)(handle)
+            gpu_info['clk_freq_max'] = freq_max if freq_max is not None else None
 
             # Processes
             nv_comp_processes = safenvml(N.nvmlDeviceGetComputeRunningProcesses)(handle)
@@ -627,7 +638,7 @@ def _wrapped(*args, **kwargs):
         for index in gpus_to_query:
             try:
                 handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index)
-                gpu_info = get_gpu_info(handle)
+                gpu_info = get_gpu_info(handle, index)
                 gpu_stat = GPUStat(gpu_info)
             except N.NVMLError_Unknown as e:
                 gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
diff --git a/gpustat/rocml.py b/gpustat/rocml.py
index fadc8f8..06c7033 100644
--- a/gpustat/rocml.py
+++ b/gpustat/rocml.py
@@ -12,7 +12,7 @@
 from collections import namedtuple
 
 
-from pyrsmi import rocml
+from amdsmi import *
 
 NVML_TEMPERATURE_GPU = 1
 
@@ -40,25 +40,25 @@ def silent_run(to_call, *args, **kwargs):
     return retval
 
 def nvmlDeviceGetCount():
-    return silent_run(rocml.smi_get_device_count)
+    return len(amdsmi_get_processor_handles())
 
 def nvmlDeviceGetHandleByIndex(dev):
-    return dev
+    return amdsmi_get_processor_handles()[dev]
 
 def nvmlDeviceGetIndex(dev):
-    return dev
+    return -1  # placeholder: no index is derived from the amdsmi handle here
 
 def nvmlDeviceGetName(dev):
-    return silent_run(rocml.smi_get_device_name, dev)
+    return amdsmi_get_gpu_board_info(dev)["product_name"]
 
 def nvmlDeviceGetUUID(dev):
-    return silent_run(rocml.smi_get_device_uuid, dev)
+    return amdsmi_get_gpu_device_uuid(dev)
 
 def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU):
-    return silent_run(rocml.smi_get_device_temp, dev, loc)
+    return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT)
 
 def nvmlSystemGetDriverVersion():
-    return silent_run(rocml.smi_get_kernel_version)
+    return ""
 
 def check_driver_nvml_version(driver_version_str: str):
     """Show warnings when an incompatible driver is used."""
@@ -76,18 +76,21 @@ def safeint(v) -> int:
         warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ")
 
 def nvmlDeviceGetFanSpeed(dev):
-    return silent_run(rocml.smi_get_device_fan_speed, dev)
+    try:
+        return amdsmi_get_gpu_fan_speed(dev, 0)
+    except Exception:
+        return None
 
 MemoryInfo = namedtuple('MemoryInfo', ['total', 'used'])
 
 def nvmlDeviceGetMemoryInfo(dev):
-    return MemoryInfo(total=silent_run(rocml.smi_get_device_memory_total, dev),
-                      used=silent_run(rocml.smi_get_device_memory_used, dev))
+    return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM),
+                      used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM))
 
 UtilizationRates = namedtuple('UtilizationRates', ['gpu'])
 
 def nvmlDeviceGetUtilizationRates(dev):
-    return UtilizationRates(gpu=silent_run(rocml.smi_get_device_utilization, dev))
+    return UtilizationRates(gpu=amdsmi_get_gpu_activity(dev)["gfx_activity"])
 
 def nvmlDeviceGetEncoderUtilization(dev):
     return None
@@ -96,33 +99,41 @@ def nvmlDeviceGetDecoderUtilization(dev):
     return None
 
 def nvmlDeviceGetPowerUsage(dev):
-    return silent_run(rocml.smi_get_device_average_power, dev)
+    return amdsmi_get_power_info(dev)["current_socket_power"]
 
 def nvmlDeviceGetEnforcedPowerLimit(dev):
-    return None
+    return amdsmi_get_power_info(dev)["power_limit"]
 
-ComputeProcess = namedtuple('ComputeProcess', ['pid'])
+ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory'])
 
 def nvmlDeviceGetComputeRunningProcesses(dev):
-    processes = silent_run(rocml.smi_get_device_compute_process)
-    return [ComputeProcess(pid=i) for i in processes]
+    results = amdsmi_get_gpu_process_list(dev)
+    return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results]
 
 def nvmlDeviceGetGraphicsRunningProcesses(dev):
     return None
 
 def nvmlDeviceGetClkFreq(dev):
-    return rocml.smi_get_device_freq(dev)
+    # The current SYS clock is read from "clk" when that key is present
+    # in the amdsmi clock info, and from "cur_clk" otherwise.
+    info = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS)
+    key = "clk" if "clk" in info else "cur_clk"
+    return info[key]
+
+def nvmlDeviceGetClkFreqMax(dev):
+    # "max_clk" field of the SYS clock info that amdsmi reports for dev.
+    return amdsmi_get_clock_info(dev, AmdSmiClkType.SYS)["max_clk"]
 
 # Upon importing this module, let rocml be initialized and remain active
 # throughout the lifespan of the python process (until gpustat exists).
 _initialized: bool
 _init_error = None
 try:
-    rocml.smi_initialize()
+    amdsmi_init()
     _initialized = True
 
     def _shutdown():
-        rocml.smi_shutdown()
+        amdsmi_shut_down()
     atexit.register(_shutdown)
 
 except Exception as exc: