WMS: GPU entity fixes (#34490)

DataDog · Feb 27, 2025 · 8a9ff47 · 8a9ff47
1 parent 49f98c6
commit 8a9ff47
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 30 deletions.
diff --git a/comp/core/workloadmeta/collectors/internal/nvml/nvml.go b/comp/core/workloadmeta/collectors/internal/nvml/nvml.go
@@ -170,14 +170,22 @@ func (c *collector) fillAttributes(gpuDeviceInfo *workloadmeta.GPU, device nvml.
 		gpuDeviceInfo.ComputeCapability.Minor = minor
 	}
 
-	devAttr, ret := device.GetAttributes()
+	totalCores, ret := device.GetNumGpuCores()
 	if ret != nvml.SUCCESS {
 		if logLimiter.ShouldLog() {
-			log.Warnf("failed to get device attributes for device index %d: %v", gpuDeviceInfo.Index, nvml.ErrorString(ret))
+			log.Warnf("failed to get total number of cores for the device %d: %v", gpuDeviceInfo.Index, nvml.ErrorString(ret))
+		}
+	} else {
+		gpuDeviceInfo.TotalCores = totalCores
+	}
+
+	totalMemory, ret := device.GetMemoryInfo()
+	if ret != nvml.SUCCESS {
+		if logLimiter.ShouldLog() {
+			log.Warnf("failed to get total available memory for the device %d: %v", gpuDeviceInfo.Index, nvml.ErrorString(ret))
 		}
 	} else {
-		gpuDeviceInfo.SMCount = int(devAttr.MultiprocessorCount)
-		gpuDeviceInfo.TotalMemoryMB = devAttr.MemorySizeMB
+		gpuDeviceInfo.TotalMemory = totalMemory.Total
 	}
 
 	memBusWidth, ret := device.GetMemoryBusWidth()

diff --git a/comp/core/workloadmeta/collectors/internal/nvml/nvml_test.go b/comp/core/workloadmeta/collectors/internal/nvml/nvml_test.go
@@ -33,7 +33,6 @@ func TestPull(t *testing.T) {
 
 	gpus := wmetaMock.ListGPUs()
 	require.Equal(t, len(testutil.GPUUUIDs), len(gpus))
-
 	var expectedActivePIDs []int
 	for _, proc := range testutil.DefaultProcessInfo {
 		expectedActivePIDs = append(expectedActivePIDs, int(proc.Pid))
@@ -42,14 +41,16 @@ func TestPull(t *testing.T) {
 	foundIDs := make(map[string]bool)
 	for _, gpu := range gpus {
 		foundIDs[gpu.ID] = true
-
+		require.Equal(t, testutil.DefaultNvidiaDriverVersion, gpu.DriverVersion)
 		require.Equal(t, nvidiaVendor, gpu.Vendor)
 		require.Equal(t, testutil.DefaultGPUName, gpu.Name)
 		require.Equal(t, testutil.DefaultGPUName, gpu.Device)
 		require.Equal(t, "hopper", gpu.Architecture)
 		require.Equal(t, testutil.DefaultGPUComputeCapMajor, gpu.ComputeCapability.Major)
 		require.Equal(t, testutil.DefaultGPUComputeCapMinor, gpu.ComputeCapability.Minor)
-		require.Equal(t, int(testutil.DefaultGPUAttributes.MultiprocessorCount), gpu.SMCount)
+		require.Equal(t, testutil.DefaultTotalMemory, gpu.TotalMemory)
+		require.Equal(t, testutil.DefaultMaxClockRates[workloadmeta.GPUSM], gpu.MaxClockRates[workloadmeta.GPUSM])
+		require.Equal(t, testutil.DefaultMaxClockRates[workloadmeta.GPUMemory], gpu.MaxClockRates[workloadmeta.GPUMemory])
 		require.Equal(t, expectedActivePIDs, gpu.ActivePIDs)
 	}
 

diff --git a/comp/core/workloadmeta/def/types.go b/comp/core/workloadmeta/def/types.go
@@ -1403,11 +1403,12 @@ type GPU struct {
 	// ComputeCapability contains the compute capability version of the GPU. Optional, can be 0/0
 	ComputeCapability GPUComputeCapability
 
-	// SMCount is the number of streaming multiprocessors in the GPU. Optional, can be empty.
-	SMCount int
+	// TotalCores is the total number of cores available on the device,
+	// i.e. the number of SMs multiplied by the number of cores per SM (which depends on the model).
+	TotalCores int
 
-	//TotalMemory is the total available memory for the device in MB
-	TotalMemoryMB uint64
+	// TotalMemory is the total available memory for the device in bytes.
+	TotalMemory uint64
 
 	// MaxClockRates contains the maximum clock rates for SM and Memory
 	MaxClockRates [GPUCOUNT]uint32
@@ -1485,8 +1486,8 @@ func (g GPU) String(verbose bool) string {
 	_, _ = fmt.Fprintln(&sb, "Index:", g.Index)
 	_, _ = fmt.Fprintln(&sb, "Architecture:", g.Architecture)
 	_, _ = fmt.Fprintln(&sb, "Compute Capability:", g.ComputeCapability)
-	_, _ = fmt.Fprintln(&sb, "Streaming Multiprocessor Count:", g.SMCount)
-	_, _ = fmt.Fprintln(&sb, "Total Memory (in MB):", g.TotalMemoryMB)
+	_, _ = fmt.Fprintln(&sb, "Total Number of Cores:", g.TotalCores)
+	_, _ = fmt.Fprintln(&sb, "Device Total Memory (in bytes):", g.TotalMemory)
 	_, _ = fmt.Fprintln(&sb, "Memory Bus Width:", g.MemoryBusWidth)
 	_, _ = fmt.Fprintln(&sb, "Max SM Clock Rate:", g.MaxClockRates[GPUSM])
 	_, _ = fmt.Fprintln(&sb, "Max Memory Clock Rate:", g.MaxClockRates[GPUMemory])

diff --git a/comp/core/workloadmeta/def/types_test.go b/comp/core/workloadmeta/def/types_test.go
@@ -162,12 +162,10 @@ func TestMergeGPU(t *testing.T) {
 		EntityMeta: EntityMeta{
 			Name: "gpu-1",
 		},
-		Vendor:         "nvidia",
-		DriverVersion:  "460.32.03",
-		Device:         "",
-		ActivePIDs:     []int{123, 456},
-		TotalMemoryMB:  4096,
-		MemoryBusWidth: 256,
+		Vendor:        "nvidia",
+		DriverVersion: "460.32.03",
+		Device:        "",
+		ActivePIDs:    []int{123, 456},
 	}
 	gpu2 := GPU{
 		EntityID: EntityID{
@@ -177,12 +175,10 @@ func TestMergeGPU(t *testing.T) {
 		EntityMeta: EntityMeta{
 			Name: "gpu-1",
 		},
-		Vendor:         "nvidia",
-		DriverVersion:  "460.32.03",
-		Device:         "tesla",
-		ActivePIDs:     []int{654},
-		TotalMemoryMB:  4096,
-		MemoryBusWidth: 256,
+		Vendor:        "nvidia",
+		DriverVersion: "460.32.03",
+		Device:        "tesla",
+		ActivePIDs:    []int{654},
 	}
 
 	err := gpu1.Merge(&gpu2)

diff --git a/pkg/gpu/testutil/mocks.go b/pkg/gpu/testutil/mocks.go
@@ -50,6 +50,12 @@ var DefaultGpuUUID = GPUUUIDs[0]
 // DefaultGPUName is the name for the default device returned by the mock
 var DefaultGPUName = "Tesla T4"
 
+// DefaultNvidiaDriverVersion is the default nvidia driver version
+var DefaultNvidiaDriverVersion = "470.57.02"
+
+// DefaultMemoryBusWidth is the memory bus width for the default device returned by the mock
+var DefaultMemoryBusWidth = uint32(256)
+
 // DefaultGPUComputeCapMajor is the major number for the compute capabilities for the default device returned by the mock
 var DefaultGPUComputeCapMajor = 7
 
@@ -62,7 +68,6 @@ var DefaultGPUArch = nvml.DeviceArchitecture(nvml.DEVICE_ARCH_HOPPER)
 // DefaultGPUAttributes is the attributes for the default device returned by the mock
 var DefaultGPUAttributes = nvml.DeviceAttributes{
 	MultiprocessorCount: 10,
-	MemorySizeMB:        4096,
 }
 
 // DefaultProcessInfo is the list of processes running on the default device returned by the mock
@@ -74,6 +79,9 @@ var DefaultProcessInfo = []nvml.ProcessInfo{
 // DefaultTotalMemory is the total memory for the default device returned by the mock
 var DefaultTotalMemory = uint64(1000)
 
+// DefaultMaxClockRates holds the maximum SM clock rate (index 0) and the maximum memory clock rate (index 1) for the default device returned by the mock
+var DefaultMaxClockRates = [2]uint32{1000, 2000}
+
 // GetDeviceMock returns a mock of the nvml.Device for the given device index.
 func GetDeviceMock(deviceIdx int) *nvmlmock.Device {
 	return &nvmlmock.Device{
@@ -102,14 +110,14 @@ func GetDeviceMock(deviceIdx int) *nvmlmock.Device {
 			return nvml.Memory{Total: DefaultTotalMemory, Free: 500}, nvml.SUCCESS
 		},
 		GetMemoryBusWidthFunc: func() (uint32, nvml.Return) {
-			return 256, nvml.SUCCESS
+			return DefaultMemoryBusWidth, nvml.SUCCESS
 		},
 		GetMaxClockInfoFunc: func(clockType nvml.ClockType) (uint32, nvml.Return) {
 			switch clockType {
 			case nvml.CLOCK_SM:
-				return 1000, nvml.SUCCESS
+				return DefaultMaxClockRates[0], nvml.SUCCESS
 			case nvml.CLOCK_MEM:
-				return 2000, nvml.SUCCESS
+				return DefaultMaxClockRates[1], nvml.SUCCESS
 			default:
 				return 0, nvml.ERROR_NOT_SUPPORTED
 			}
@@ -148,7 +156,7 @@ func GetBasicNvmlMock() *nvmlmock.Interface {
 			return nvml.Memory{Total: DefaultTotalMemory, Free: 500}, nvml.SUCCESS
 		},
 		SystemGetDriverVersionFunc: func() (string, nvml.Return) {
-			return "470.57.02", nvml.SUCCESS
+			return DefaultNvidiaDriverVersion, nvml.SUCCESS
 		},
 	}
 }