diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go
index c78f8434..24b06a33 100644
--- a/pkg/cmd/app.go
+++ b/pkg/cmd/app.go
@@ -75,6 +75,7 @@ const (
 	CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket"
 	CLIHPCJobMappingDir = "hpc-job-mapping-dir"
 	CLINvidiaResourceNames = "nvidia-resource-names"
+	CLIKubernetesVirtualGPUs = "kubernetes-virtual-gpus"
 )
 
 func NewApp(buildVersion ...string) *cli.App {
@@ -244,6 +245,12 @@ func NewApp(buildVersion ...string) *cli.App {
 			Usage:   "Nvidia resource names for specified GPU type like nvidia.com/a100, nvidia.com/a10.",
 			EnvVars: []string{"NVIDIA_RESOURCE_NAMES"},
 		},
+		&cli.BoolFlag{
+			Name:    CLIKubernetesVirtualGPUs,
+			Value:   false,
+			Usage:   "Capture metrics associated with virtual GPUs exposed by Kubernetes device plugins when using GPU sharing strategies, e.g. time-sharing or MPS.",
+			EnvVars: []string{"KUBERNETES_VIRTUAL_GPUS"},
+		},
 	}
 
 	if runtime.GOOS == "linux" {
@@ -639,5 +646,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
 		PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket),
 		HPCJobMappingDir:          c.String(CLIHPCJobMappingDir),
 		NvidiaResourceNames:       c.StringSlice(CLINvidiaResourceNames),
+		KubernetesVirtualGPUs:     c.Bool(CLIKubernetesVirtualGPUs),
 	}, nil
 }
diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go
index f13c91db..cb951b12 100644
--- a/pkg/dcgmexporter/config.go
+++ b/pkg/dcgmexporter/config.go
@@ -59,4 +59,5 @@ type Config struct {
 	PodResourcesKubeletSocket string
 	HPCJobMappingDir          string
 	NvidiaResourceNames       []string
+	KubernetesVirtualGPUs     bool
 }
diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go
index 8fb8d7d2..46aa4e9f 100644
--- a/pkg/dcgmexporter/kubernetes.go
+++ b/pkg/dcgmexporter/kubernetes.go
@@ -36,7 +36,8 @@ import (
 
 var (
 	connectionTimeout = 10 * time.Second
-	gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`)
+	// Allow for MIG devices with or without GPU sharing to match.
+	gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)(/vgpu[0-9]+)?$`)
 	gkeVirtualGPUDeviceIDSeparator = "/vgpu"
 	nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID
 )
@@ -73,6 +74,53 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error
 		return err
 	}
 
+	if p.Config.KubernetesVirtualGPUs {
+		deviceToPods := p.toDeviceToSharingPods(pods, sysInfo)
+
+		logrus.Infof("Device to sharing pods mapping: %+v", deviceToPods)
+
+		// For each counter metric, init a slice to collect metrics to associate with shared virtual GPUs.
+		for counter := range metrics {
+			var newmetrics []Metric
+			// For each instrumented device, build list of metrics and create
+			// new metrics for any shared GPUs.
+			for j, val := range metrics[counter] {
+				deviceID, err := val.getIDOfType(p.Config.KubernetesGPUIdType)
+				if err != nil {
+					return err
+				}
+
+				podInfos, _ := deviceToPods[deviceID]
+				// For all containers using the GPU, extract and annotate a metric
+				// with the container info and the shared GPU label, if it exists.
+				// Notably, this will increase the number of unique metrics (i.e. labelsets)
+				// by the number of containers sharing the GPU.
+				for _, pi := range podInfos {
+					metric, err := deepCopy(metrics[counter][j])
+					if err != nil {
+						return err
+					}
+					if !p.Config.UseOldNamespace {
+						metric.Attributes[podAttribute] = pi.Name
+						metric.Attributes[namespaceAttribute] = pi.Namespace
+						metric.Attributes[containerAttribute] = pi.Container
+					} else {
+						metric.Attributes[oldPodAttribute] = pi.Name
+						metric.Attributes[oldNamespaceAttribute] = pi.Namespace
+						metric.Attributes[oldContainerAttribute] = pi.Container
+					}
+					if pi.VGPU != "" {
+						metric.Attributes[vgpuAttribute] = pi.VGPU
+					}
+					newmetrics = append(newmetrics, metric)
+				}
+			}
+			// Upsert the annotated metrics into the final map.
+			metrics[counter] = newmetrics
+		}
+		return nil
+	}
+
 	deviceToPod := p.toDeviceToPod(pods, sysInfo)
 
 	logrus.Debugf("Device to pod mapping: %+v", deviceToPod)
@@ -138,6 +186,86 @@ func (p *PodMapper) listPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodRes
 	return resp, nil
 }
 
+func getSharedGPU(deviceID string) (string, bool) {
+	// Check if we're using the GKE device plugin or NVIDIA device plugin.
+	if strings.Contains(deviceID, gkeVirtualGPUDeviceIDSeparator) {
+		return strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[1], true
+	} else if strings.Contains(deviceID, "::") {
+		return strings.Split(deviceID, "::")[1], true
+	}
+	return "", false
+}
+
+// toDeviceToSharingPods uses the same general logic as toDeviceToPod but
+// allows for multiple containers to be associated with a metric when sharing
+// strategies are used in Kubernetes.
+// TODO(pintohuch): the logic is manually duplicated from toDeviceToPod for
+// better isolation and easier review. Ultimately, this logic should be
+// merged into a single function that can handle both shared and non-shared
+// GPU states.
+func (p *PodMapper) toDeviceToSharingPods(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string][]PodInfo {
+	deviceToPodsMap := make(map[string][]PodInfo)
+
+	for _, pod := range devicePods.GetPodResources() {
+		for _, container := range pod.GetContainers() {
+			for _, device := range container.GetDevices() {
+
+				resourceName := device.GetResourceName()
+				if resourceName != nvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) {
+					// Mig resources appear differently than GPU resources
+					if !strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) {
+						continue
+					}
+				}
+
+				podInfo := PodInfo{
+					Name:      pod.GetName(),
+					Namespace: pod.GetNamespace(),
+					Container: container.GetName(),
+				}
+
+				for _, deviceID := range device.GetDeviceIds() {
+					if vgpu, ok := getSharedGPU(deviceID); ok {
+						podInfo.VGPU = vgpu
+					}
+					if strings.HasPrefix(deviceID, MIG_UUID_PREFIX) {
+						migDevice, err := nvmlGetMIGDeviceInfoByIDHook(deviceID)
+						if err == nil {
+							giIdentifier := GetGPUInstanceIdentifier(sysInfo, migDevice.ParentUUID,
+								uint(migDevice.GPUInstanceID))
+							deviceToPodsMap[giIdentifier] = append(deviceToPodsMap[giIdentifier], podInfo)
+						}
+						gpuUUID := deviceID[len(MIG_UUID_PREFIX):]
+						deviceToPodsMap[gpuUUID] = append(deviceToPodsMap[gpuUUID], podInfo)
+					} else if gkeMigDeviceIDMatches := gkeMigDeviceIDRegex.FindStringSubmatch(deviceID); gkeMigDeviceIDMatches != nil {
+						var gpuIndex string
+						var gpuInstanceID string
+						for groupIdx, group := range gkeMigDeviceIDMatches {
+							switch groupIdx {
+							case 1:
+								gpuIndex = group
+							case 2:
+								gpuInstanceID = group
+							}
+						}
+						giIdentifier := fmt.Sprintf("%s-%s", gpuIndex, gpuInstanceID)
+						deviceToPodsMap[giIdentifier] = append(deviceToPodsMap[giIdentifier], podInfo)
+					} else if strings.Contains(deviceID, gkeVirtualGPUDeviceIDSeparator) {
+						deviceToPodsMap[strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[0]] = append(deviceToPodsMap[strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[0]], podInfo)
+					} else if strings.Contains(deviceID, "::") {
+						gpuInstanceID := strings.Split(deviceID, "::")[0]
+						deviceToPodsMap[gpuInstanceID] = append(deviceToPodsMap[gpuInstanceID], podInfo)
+					}
+					// Default mapping between deviceID and pod information
+					deviceToPodsMap[deviceID] = append(deviceToPodsMap[deviceID], podInfo)
+				}
+			}
+		}
+	}
+
+	return deviceToPodsMap
+}
+
 func (p *PodMapper) toDeviceToPod(
 	devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo,
 ) map[string]PodInfo {
diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go
index 3b48efe2..6911af1f 100644
--- a/pkg/dcgmexporter/kubernetes_test.go
+++ b/pkg/dcgmexporter/kubernetes_test.go
@@ -35,48 +35,52 @@ import (
 )
 
 func TestProcessPodMapper(t *testing.T) {
-	testutils.RequireLinux(t)
+	var kubeVirtualGPUs = []bool{false, true}
+	for _, virtual := range kubeVirtualGPUs {
+		testutils.RequireLinux(t)
 
-	tmpDir, cleanup := CreateTmpDir(t)
-	defer cleanup()
+		tmpDir, cleanup := CreateTmpDir(t)
+		defer cleanup()
 
-	cleanup, err := dcgm.Init(dcgm.Embedded)
-	require.NoError(t, err)
-	defer cleanup()
+		cleanup, err := dcgm.Init(dcgm.Embedded)
+		require.NoError(t, err)
+		defer cleanup()
 
-	c, cleanup := testDCGMGPUCollector(t, sampleCounters)
-	defer cleanup()
+		c, cleanup := testDCGMGPUCollector(t, sampleCounters)
+		defer cleanup()
 
-	out, err := c.GetMetrics()
-	require.NoError(t, err)
+		out, err := c.GetMetrics()
+		require.NoError(t, err)
 
-	original := out
+		original := out
 
-	arbirtaryMetric := out[reflect.ValueOf(out).MapKeys()[0].Interface().(Counter)]
+		arbirtaryMetric := out[reflect.ValueOf(out).MapKeys()[0].Interface().(Counter)]
 
-	socketPath := tmpDir + "/kubelet.sock"
-	server := grpc.NewServer()
-	gpus := GetGPUUUIDs(arbirtaryMetric)
-	podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpus))
+		socketPath := tmpDir + "/kubelet.sock"
+		server := grpc.NewServer()
+		gpus := GetGPUUUIDs(arbirtaryMetric)
+		podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpus))
 
-	cleanup = StartMockServer(t, server, socketPath)
-	defer cleanup()
+		cleanup = StartMockServer(t, server, socketPath)
+		defer cleanup()
 
-	podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID, PodResourcesKubeletSocket: socketPath})
-	require.NoError(t, err)
-	var sysInfo SystemInfo
-	err = podMapper.Process(out, sysInfo)
-	require.NoError(t, err)
+		podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID, PodResourcesKubeletSocket: socketPath, KubernetesVirtualGPUs: virtual})
+		require.NoError(t, err)
+		var sysInfo SystemInfo
+
+		err = podMapper.Process(out, sysInfo)
+		require.NoError(t, err)
 
-	require.Len(t, out, len(original))
-	for _, metrics := range out {
-		for _, metric := range metrics {
-			require.Contains(t, metric.Attributes, podAttribute)
-			require.Contains(t, metric.Attributes, namespaceAttribute)
-			require.Contains(t, metric.Attributes, containerAttribute)
-			require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%s", metric.GPU))
-			require.Equal(t, metric.Attributes[namespaceAttribute], "default")
-			require.Equal(t, metric.Attributes[containerAttribute], "default")
+		require.Len(t, out, len(original))
+		for _, metrics := range out {
+			for _, metric := range metrics {
+				require.Contains(t, metric.Attributes, podAttribute)
+				require.Contains(t, metric.Attributes, namespaceAttribute)
+				require.Contains(t, metric.Attributes, containerAttribute)
+				require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%s", metric.GPU))
+				require.Equal(t, metric.Attributes[namespaceAttribute], "default")
+				require.Equal(t, metric.Attributes[containerAttribute], "default")
+			}
 		}
 	}
 }
@@ -167,14 +171,16 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 	testutils.RequireLinux(t)
 
 	type TestCase struct {
-		KubernetesGPUIDType KubernetesGPUIDType
-		GPUInstanceID       uint
-		ResourceName        string
-		MetricGPUID         string
-		MetricGPUDevice     string
-		MetricMigProfile    string
-		PODGPUID            string
-		NvidiaResourceNames []string
+		KubernetesGPUIDType  KubernetesGPUIDType
+		GPUInstanceID        uint
+		ResourceName         string
+		MetricGPUID          string
+		MetricGPUDevice      string
+		MetricMigProfile     string
+		PODGPUIDs            []string
+		NvidiaResourceNames  []string
+		KubernetesVirtualGPU bool
+		VGPUs                []string
 	}
 
 	testCases := []TestCase{
@@ -182,13 +188,13 @@
 			KubernetesGPUIDType: GPUUID,
 			ResourceName: nvidiaResourceName,
 			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
-			PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5"},
 		},
 		{
 			KubernetesGPUIDType: GPUUID,
 			ResourceName: nvidiaResourceName,
 			MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
-			PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
 			MetricMigProfile: "",
 		},
 		{
@@ -197,39 +203,39 @@
 			KubernetesGPUIDType: GPUUID,
 			ResourceName: nvidiaResourceName,
 			GPUInstanceID: 3,
 			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
 			MetricMigProfile: "",
-			PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
 		},
 		{
 			KubernetesGPUIDType: DeviceName,
 			ResourceName: nvidiaResourceName,
 			GPUInstanceID: 3,
 			MetricMigProfile: "mig",
-			PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
 		},
 		{
 			KubernetesGPUIDType: DeviceName,
 			ResourceName: nvidiaResourceName,
 			MetricMigProfile: "mig",
-			PODGPUID: "nvidia0/gi0",
+			PODGPUIDs: []string{"nvidia0/gi0"},
 		},
 		{
 			KubernetesGPUIDType: DeviceName,
 			ResourceName: nvidiaResourceName,
 			MetricGPUDevice: "0",
-			PODGPUID: "0/vgpu",
+			PODGPUIDs: []string{"0/vgpu"},
 		},
 		{
 			KubernetesGPUIDType: GPUUID,
 			ResourceName: nvidiaResourceName,
 			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
-			PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5::"},
 		},
 		{
 			KubernetesGPUIDType: GPUUID,
 			ResourceName: "nvidia.com/mig-1g.10gb",
 			MetricMigProfile: "1g.10gb",
 			MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
-			PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
 			MetricGPUDevice: "0",
 			GPUInstanceID: 3,
 		},
@@ -237,17 +243,161 @@
 			KubernetesGPUIDType: GPUUID,
 			ResourceName: "nvidia.com/a100",
 			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
-			PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5"},
 			NvidiaResourceNames: []string{"nvidia.com/a100"},
 		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: nvidiaResourceName,
+			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: nvidiaResourceName,
+			MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
+			MetricMigProfile: "",
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: nvidiaResourceName,
+			GPUInstanceID: 3,
+			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			MetricMigProfile: "",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			GPUInstanceID: 3,
+			MetricMigProfile: "mig",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			MetricMigProfile: "mig",
+			PODGPUIDs: []string{"nvidia0/gi0"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			MetricGPUDevice: "0",
+			PODGPUIDs: []string{"0/vgpu"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: nvidiaResourceName,
+			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5::"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: "nvidia.com/mig-1g.10gb",
+			MetricMigProfile: "1g.10gb",
+			MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5"},
+			MetricGPUDevice: "0",
+			GPUInstanceID: 3,
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: "nvidia.com/a100",
+			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5"},
+			NvidiaResourceNames: []string{"nvidia.com/a100"},
+			KubernetesVirtualGPU: true,
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			MetricMigProfile: "mig",
+			PODGPUIDs: []string{"nvidia0/gi3/vgpu0"},
+			GPUInstanceID: 3,
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"0"},
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			PODGPUIDs: []string{"nvidia0/vgpu1"},
+			MetricGPUDevice: "nvidia0",
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"1"},
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: nvidiaResourceName,
+			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5::2"},
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"2"},
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: "nvidia.com/mig-1g.10gb",
+			MetricMigProfile: "1g.10gb",
+			MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5::4"},
+			MetricGPUDevice: "0",
+			GPUInstanceID: 3,
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"4"},
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			MetricMigProfile: "mig",
+			PODGPUIDs: []string{"nvidia0/gi3/vgpu0", "nvidia0/gi3/vgpu1"},
+			GPUInstanceID: 3,
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"0", "1"},
+		},
+		{
+			KubernetesGPUIDType: DeviceName,
+			ResourceName: nvidiaResourceName,
+			PODGPUIDs: []string{"nvidia0/vgpu1", "nvidia0/vgpu2"},
+			MetricGPUDevice: "nvidia0",
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"1", "2"},
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: nvidiaResourceName,
+			MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"b8ea3855-276c-c9cb-b366-c6fa655957c5::2", "b8ea3855-276c-c9cb-b366-c6fa655957c5::3"},
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"2", "3"},
+		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName: "nvidia.com/mig-1g.10gb",
+			MetricMigProfile: "1g.10gb",
+			MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUIDs: []string{"MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5::4", "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5::5"},
+			MetricGPUDevice: "0",
+			GPUInstanceID: 3,
+			KubernetesVirtualGPU: true,
+			VGPUs: []string{"4", "5"},
+		},
 	}
 
 	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("when type %s, pod device id %s metric device id %s and gpu device %s",
+		t.Run(fmt.Sprintf("when type %s, pod device ids %s metric device id %s and gpu device %s with virtual GPUs: %t",
 			tc.KubernetesGPUIDType,
-			tc.PODGPUID,
+			tc.PODGPUIDs,
 			tc.MetricGPUID,
 			tc.MetricGPUDevice,
+			tc.KubernetesVirtualGPU,
 		), func(t *testing.T) {
@@ -259,7 +409,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 			require.NoError(t, err)
 			defer cleanup()
 
-			gpus := []string{tc.PODGPUID}
+			gpus := tc.PODGPUIDs
 			podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(tc.ResourceName, gpus))
 
 			cleanup = StartMockServer(t, server, socketPath)
 			defer cleanup()
@@ -281,6 +431,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 				KubernetesGPUIdType: tc.KubernetesGPUIDType,
 				PodResourcesKubeletSocket: socketPath,
 				NvidiaResourceNames: tc.NvidiaResourceNames,
+				KubernetesVirtualGPUs: tc.KubernetesVirtualGPU,
 			})
 			require.NoError(t, err)
 			require.NotNil(t, podMapper)
@@ -321,17 +472,94 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 			err = podMapper.Process(metrics, sysInfo)
 			require.NoError(t, err)
 			assert.Len(t, metrics, 1)
-			for _, metric := range metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] {
+			if tc.KubernetesVirtualGPU {
+				assert.Len(t, metrics[counter], len(gpus))
+			}
+
+			for i, metric := range metrics[counter] {
 				require.Contains(t, metric.Attributes, podAttribute)
 				require.Contains(t, metric.Attributes, namespaceAttribute)
 				require.Contains(t, metric.Attributes, containerAttribute)
 
 				// TODO currently we rely on ordering and implicit expectations of the mock implementation
 				// This should be a table comparison
-				require.Equal(t, fmt.Sprintf("gpu-pod-%d", 0), metric.Attributes[podAttribute])
+				require.Equal(t, fmt.Sprintf("gpu-pod-%d", i), metric.Attributes[podAttribute])
 				require.Equal(t, "default", metric.Attributes[namespaceAttribute])
 				require.Equal(t, "default", metric.Attributes[containerAttribute])
+
+				// Assert virtual GPU attributes.
+				vgpu, ok := metric.Attributes[vgpuAttribute]
+				// Ensure vgpu attribute only exists when vgpu is enabled.
+				if ok && !tc.KubernetesVirtualGPU {
+					t.Errorf("%s attribute should not be present unless configured", vgpuAttribute)
+				}
+				// Ensure we only populate non-empty values for the vgpu attribute.
+				if ok {
+					require.NotEqual(t, "", vgpu)
+					require.Equal(t, tc.VGPUs[i], vgpu)
+				}
 			}
 		})
 	}
 }
+
+func TestGetSharedGPU(t *testing.T) {
+	cases := []struct {
+		desc, deviceID string
+		wantVGPU       string
+		wantOK         bool
+	}{
+		{
+			desc: "gke device plugin, non-mig, shared",
+			deviceID: "nvidia0/vgpu0",
+			wantVGPU: "0",
+			wantOK: true,
+		},
+		{
+			desc: "gke device plugin, non-mig, non-shared",
+			deviceID: "nvidia0",
+		},
+		{
+			desc: "gke device plugin, mig, shared",
+			deviceID: "nvidia0/gi0/vgpu1",
+			wantVGPU: "1",
+			wantOK: true,
+		},
+		{
+			desc: "gke device plugin, mig, non-shared",
+			deviceID: "nvidia0/gi0",
+		},
+		{
+			desc: "nvidia device plugin, non-mig, shared",
+			deviceID: "GPU-5a5a7118-e550-79a1-597e-7631e126c57a::3",
+			wantVGPU: "3",
+			wantOK: true,
+		},
+		{
+			desc: "nvidia device plugin, non-mig, non-shared",
+			deviceID: "GPU-5a5a7118-e550-79a1-597e-7631e126c57a",
+		},
+		{
+			desc: "nvidia device plugin, mig, shared",
+			deviceID: "MIG-42f0f413-f7b0-58cc-aced-c1d1fb54db26::0",
+			wantVGPU: "0",
+			wantOK: true,
+		},
+		{
+			desc: "nvidia device plugin, mig, non-shared",
+			deviceID: "MIG-42f0f413-f7b0-58cc-aced-c1d1fb54db26",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.desc, func(t *testing.T) {
+			gotVGPU, gotOK := getSharedGPU(tc.deviceID)
+			if gotVGPU != tc.wantVGPU {
+				t.Errorf("expected: %s, got: %s", tc.wantVGPU, gotVGPU)
+			}
+			if gotOK != tc.wantOK {
+				t.Errorf("expected: %t, got: %t", tc.wantOK, gotOK)
+			}
+		})
+	}
+}
diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go
index 246afe02..ff527618 100644
--- a/pkg/dcgmexporter/types.go
+++ b/pkg/dcgmexporter/types.go
@@ -38,6 +38,7 @@ var (
 	podAttribute = "pod"
 	namespaceAttribute = "namespace"
 	containerAttribute = "container"
+	vgpuAttribute = "vgpu"
 
 	hpcJobAttribute = "hpc_job"
 
@@ -148,6 +149,7 @@ type PodInfo struct {
 	Name string
 	Namespace string
 	Container string
+	VGPU string
 }
 
 // MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects
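
A minimal usage sketch, assuming the exporter binary is invoked as dcgm-exporter: the flag and environment variable names come from the BoolFlag added in pkg/cmd/app.go above, while the metric name, label values, and sample value are only illustrative.

    # Enable capture of Kubernetes virtual GPUs (time-sharing / MPS replicas):
    dcgm-exporter --kubernetes-virtual-gpus
    # or, equivalently, via the environment:
    KUBERNETES_VIRTUAL_GPUS=true dcgm-exporter

    # With sharing in use, each container sharing a GPU gets its own labelset, and a
    # "vgpu" label carries the replica index when the device plugin exposes one, e.g.:
    #   DCGM_FI_DEV_GPU_UTIL{gpu="0",pod="gpu-pod-0",namespace="default",container="default",vgpu="2"} 85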