Skip to content

Commit

Permalink
WMS: GPU entity fixes (#34490)
Browse files Browse the repository at this point in the history
  • Loading branch information
val06 authored Feb 27, 2025
1 parent 49f98c6 commit 8a9ff47
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 30 deletions.
16 changes: 12 additions & 4 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,14 +170,22 @@ func (c *collector) fillAttributes(gpuDeviceInfo *workloadmeta.GPU, device nvml.
gpuDeviceInfo.ComputeCapability.Minor = minor
}

devAttr, ret := device.GetAttributes()
totalCores, ret := device.GetNumGpuCores()
if ret != nvml.SUCCESS {
if logLimiter.ShouldLog() {
log.Warnf("failed to get device attributes for device index %d: %v", gpuDeviceInfo.Index, nvml.ErrorString(ret))
log.Warnf("failed to get total number of cores for the device %d: %v", gpuDeviceInfo.Index, nvml.ErrorString(ret))
}
} else {
gpuDeviceInfo.TotalCores = totalCores
}

totalMemory, ret := device.GetMemoryInfo()
if ret != nvml.SUCCESS {
if logLimiter.ShouldLog() {
log.Warnf("failed to get total available memory for the device %d: %v", gpuDeviceInfo.Index, nvml.ErrorString(ret))
}
} else {
gpuDeviceInfo.SMCount = int(devAttr.MultiprocessorCount)
gpuDeviceInfo.TotalMemoryMB = devAttr.MemorySizeMB
gpuDeviceInfo.TotalMemory = totalMemory.Total
}

memBusWidth, ret := device.GetMemoryBusWidth()
Expand Down
7 changes: 4 additions & 3 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ func TestPull(t *testing.T) {

gpus := wmetaMock.ListGPUs()
require.Equal(t, len(testutil.GPUUUIDs), len(gpus))

var expectedActivePIDs []int
for _, proc := range testutil.DefaultProcessInfo {
expectedActivePIDs = append(expectedActivePIDs, int(proc.Pid))
Expand All @@ -42,14 +41,16 @@ func TestPull(t *testing.T) {
foundIDs := make(map[string]bool)
for _, gpu := range gpus {
foundIDs[gpu.ID] = true

require.Equal(t, testutil.DefaultNvidiaDriverVersion, gpu.DriverVersion)
require.Equal(t, nvidiaVendor, gpu.Vendor)
require.Equal(t, testutil.DefaultGPUName, gpu.Name)
require.Equal(t, testutil.DefaultGPUName, gpu.Device)
require.Equal(t, "hopper", gpu.Architecture)
require.Equal(t, testutil.DefaultGPUComputeCapMajor, gpu.ComputeCapability.Major)
require.Equal(t, testutil.DefaultGPUComputeCapMinor, gpu.ComputeCapability.Minor)
require.Equal(t, int(testutil.DefaultGPUAttributes.MultiprocessorCount), gpu.SMCount)
require.Equal(t, testutil.DefaultTotalMemory, gpu.TotalMemory)
require.Equal(t, testutil.DefaultMaxClockRates[workloadmeta.GPUSM], gpu.MaxClockRates[workloadmeta.GPUSM])
require.Equal(t, testutil.DefaultMaxClockRates[workloadmeta.GPUMemory], gpu.MaxClockRates[workloadmeta.GPUMemory])
require.Equal(t, expectedActivePIDs, gpu.ActivePIDs)
}

Expand Down
13 changes: 7 additions & 6 deletions comp/core/workloadmeta/def/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1403,11 +1403,12 @@ type GPU struct {
// ComputeCapability contains the compute capability version of the GPU. Optional, can be 0/0
ComputeCapability GPUComputeCapability

// SMCount is the number of streaming multiprocessors in the GPU. Optional, can be empty.
SMCount int
// TotalCores is the total number of cores available for the device,
// i.e. the number of SMs multiplied by the number of cores per SM (which depends on the model).
TotalCores int

//TotalMemory is the total available memory for the device in MB
TotalMemoryMB uint64
// TotalMemory is the total available memory for the device in bytes
TotalMemory uint64

// MaxClockRates contains the maximum clock rates for SM and Memory
MaxClockRates [GPUCOUNT]uint32
Expand Down Expand Up @@ -1485,8 +1486,8 @@ func (g GPU) String(verbose bool) string {
_, _ = fmt.Fprintln(&sb, "Index:", g.Index)
_, _ = fmt.Fprintln(&sb, "Architecture:", g.Architecture)
_, _ = fmt.Fprintln(&sb, "Compute Capability:", g.ComputeCapability)
_, _ = fmt.Fprintln(&sb, "Streaming Multiprocessor Count:", g.SMCount)
_, _ = fmt.Fprintln(&sb, "Total Memory (in MB):", g.TotalMemoryMB)
_, _ = fmt.Fprintln(&sb, "Total Number of Cores:", g.TotalCores)
_, _ = fmt.Fprintln(&sb, "Device Total Memory (in bytes):", g.TotalMemory)
_, _ = fmt.Fprintln(&sb, "Memory Bus Width:", g.MemoryBusWidth)
_, _ = fmt.Fprintln(&sb, "Max SM Clock Rate:", g.MaxClockRates[GPUSM])
_, _ = fmt.Fprintln(&sb, "Max Memory Clock Rate:", g.MaxClockRates[GPUMemory])
Expand Down
20 changes: 8 additions & 12 deletions comp/core/workloadmeta/def/types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,10 @@ func TestMergeGPU(t *testing.T) {
EntityMeta: EntityMeta{
Name: "gpu-1",
},
Vendor: "nvidia",
DriverVersion: "460.32.03",
Device: "",
ActivePIDs: []int{123, 456},
TotalMemoryMB: 4096,
MemoryBusWidth: 256,
Vendor: "nvidia",
DriverVersion: "460.32.03",
Device: "",
ActivePIDs: []int{123, 456},
}
gpu2 := GPU{
EntityID: EntityID{
Expand All @@ -177,12 +175,10 @@ func TestMergeGPU(t *testing.T) {
EntityMeta: EntityMeta{
Name: "gpu-1",
},
Vendor: "nvidia",
DriverVersion: "460.32.03",
Device: "tesla",
ActivePIDs: []int{654},
TotalMemoryMB: 4096,
MemoryBusWidth: 256,
Vendor: "nvidia",
DriverVersion: "460.32.03",
Device: "tesla",
ActivePIDs: []int{654},
}

err := gpu1.Merge(&gpu2)
Expand Down
18 changes: 13 additions & 5 deletions pkg/gpu/testutil/mocks.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ var DefaultGpuUUID = GPUUUIDs[0]
// DefaultGPUName is the name for the default device returned by the mock
var DefaultGPUName = "Tesla T4"

// DefaultNvidiaDriverVersion is the default nvidia driver version
var DefaultNvidiaDriverVersion = "470.57.02"

// DefaultMemoryBusWidth is the memory bus width for the default device returned by the mock
var DefaultMemoryBusWidth = uint32(256)

// DefaultGPUComputeCapMajor is the major number for the compute capabilities for the default device returned by the mock
var DefaultGPUComputeCapMajor = 7

Expand All @@ -62,7 +68,6 @@ var DefaultGPUArch = nvml.DeviceArchitecture(nvml.DEVICE_ARCH_HOPPER)
// DefaultGPUAttributes is the attributes for the default device returned by the mock
var DefaultGPUAttributes = nvml.DeviceAttributes{
MultiprocessorCount: 10,
MemorySizeMB: 4096,
}

// DefaultProcessInfo is the list of processes running on the default device returned by the mock
Expand All @@ -74,6 +79,9 @@ var DefaultProcessInfo = []nvml.ProcessInfo{
// DefaultTotalMemory is the total memory for the default device returned by the mock
var DefaultTotalMemory = uint64(1000)

// DefaultMaxClockRates holds the max SM clock rate (index 0) and the max memory clock rate (index 1) for the default device
var DefaultMaxClockRates = [2]uint32{1000, 2000}

// GetDeviceMock returns a mock of the nvml.Device for the given device index.
func GetDeviceMock(deviceIdx int) *nvmlmock.Device {
return &nvmlmock.Device{
Expand Down Expand Up @@ -102,14 +110,14 @@ func GetDeviceMock(deviceIdx int) *nvmlmock.Device {
return nvml.Memory{Total: DefaultTotalMemory, Free: 500}, nvml.SUCCESS
},
GetMemoryBusWidthFunc: func() (uint32, nvml.Return) {
return 256, nvml.SUCCESS
return DefaultMemoryBusWidth, nvml.SUCCESS
},
GetMaxClockInfoFunc: func(clockType nvml.ClockType) (uint32, nvml.Return) {
switch clockType {
case nvml.CLOCK_SM:
return 1000, nvml.SUCCESS
return DefaultMaxClockRates[0], nvml.SUCCESS
case nvml.CLOCK_MEM:
return 2000, nvml.SUCCESS
return DefaultMaxClockRates[1], nvml.SUCCESS
default:
return 0, nvml.ERROR_NOT_SUPPORTED
}
Expand Down Expand Up @@ -148,7 +156,7 @@ func GetBasicNvmlMock() *nvmlmock.Interface {
return nvml.Memory{Total: DefaultTotalMemory, Free: 500}, nvml.SUCCESS
},
SystemGetDriverVersionFunc: func() (string, nvml.Return) {
return "470.57.02", nvml.SUCCESS
return DefaultNvidiaDriverVersion, nvml.SUCCESS
},
}
}
Expand Down

0 comments on commit 8a9ff47

Please sign in to comment.