NVIDIA · elezar · Mar 16, 2024 · Mar 15, 2024 · Mar 16, 2024 · Mar 15, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,15 @@
 ## Changelog
 
+### Version v0.15.0-rc.2
 - Bump CUDA base image version to 12.3.2
 - Add `cdi-cri` device list strategy. This uses the CDIDevices CRI field to request CDI devices instead of annotations.
 - Set MPS memory limit by device index and not device UUID. This is a workaround for an issue where
   these limits are not applied for devices if set by UUID.
 - Update MPS sharing to disallow requests for multiple devices if MPS sharing is configured.
+- Set MPS device memory limit by index.
+- Explicitly set `sharing.mps.failRequestsGreaterThanOne = true`.
+- Run `tail -f` for each MPS daemon to output logs.
+- Enforce replica limits for MPS sharing.
 
 ### Version v0.15.0-rc.1
 - Import GPU Feature Discovery into the GPU Device Plugin repo. This means that

diff --git a/deployments/helm/nvidia-device-plugin/Chart.yaml b/deployments/helm/nvidia-device-plugin/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: nvidia-device-plugin
 type: application
 description: A Helm chart for the nvidia-device-plugin on Kubernetes
-version: "0.15.0-rc.1"
-appVersion: "0.15.0-rc.1"
+version: "0.15.0-rc.2"
+appVersion: "0.15.0-rc.2"
 kubeVersion: ">= 1.10.0-0"
 home: https://github.com/NVIDIA/k8s-device-plugin
 

diff --git a/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml b/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml
@@ -4,7 +4,7 @@ metadata:
   name: gpu-feature-discovery
   labels:
     app.kubernetes.io/name: gpu-feature-discovery
-    app.kubernetes.io/version: 0.14.2
+    app.kubernetes.io/version: 0.15.0-rc.2
     app.kubernetes.io/part-of: nvidia-gpu
 spec:
   selector:
@@ -15,11 +15,11 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: gpu-feature-discovery
-        app.kubernetes.io/version: 0.14.2
+        app.kubernetes.io/version: 0.15.0-rc.2
         app.kubernetes.io/part-of: nvidia-gpu
     spec:
       containers:
-        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
           name: gpu-feature-discovery
           command: ["/usr/bin/gpu-feature-discovery"]
           volumeMounts:

diff --git a/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml b/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml
@@ -4,7 +4,7 @@ metadata:
   name: gpu-feature-discovery
   labels:
     app.kubernetes.io/name: gpu-feature-discovery
-    app.kubernetes.io/version: 0.14.2
+    app.kubernetes.io/version: 0.15.0-rc.2
     app.kubernetes.io/part-of: nvidia-gpu
 spec:
   selector:
@@ -15,11 +15,11 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: gpu-feature-discovery
-        app.kubernetes.io/version: 0.14.2
+        app.kubernetes.io/version: 0.15.0-rc.2
         app.kubernetes.io/part-of: nvidia-gpu
     spec:
       containers:
-        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
           name: gpu-feature-discovery
           command: ["/usr/bin/gpu-feature-discovery"]
           volumeMounts:

diff --git a/deployments/static/gpu-feature-discovery-daemonset.yaml b/deployments/static/gpu-feature-discovery-daemonset.yaml
@@ -4,7 +4,7 @@ metadata:
   name: gpu-feature-discovery
   labels:
     app.kubernetes.io/name: gpu-feature-discovery
-    app.kubernetes.io/version: 0.14.2
+    app.kubernetes.io/version: 0.15.0-rc.2
     app.kubernetes.io/part-of: nvidia-gpu
 spec:
   selector:
@@ -15,11 +15,11 @@ spec:
     metadata:
       labels:
         app.kubernetes.io/name: gpu-feature-discovery
-        app.kubernetes.io/version: 0.14.2
+        app.kubernetes.io/version: 0.15.0-rc.2
         app.kubernetes.io/part-of: nvidia-gpu
     spec:
       containers:
-        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
           name: gpu-feature-discovery
           command: ["/usr/bin/gpu-feature-discovery"]
           volumeMounts:

diff --git a/deployments/static/gpu-feature-discovery-job.yaml.template b/deployments/static/gpu-feature-discovery-job.yaml.template
@@ -4,19 +4,19 @@ metadata:
   name: gpu-feature-discovery
   labels:
     app.kubernetes.io/name: gpu-feature-discovery
-    app.kubernetes.io/version: 0.14.2
+    app.kubernetes.io/version: 0.15.0-rc.2
     app.kubernetes.io/part-of: nvidia-gpu
 spec:
   template:
     metadata:
       labels:
         app.kubernetes.io/name: gpu-feature-discovery
-        app.kubernetes.io/version: 0.14.2
+        app.kubernetes.io/version: 0.15.0-rc.2
         app.kubernetes.io/part-of: nvidia-gpu
     spec:
       nodeName: NODE_NAME
       containers:
-        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
           name: gpu-feature-discovery
           command: ["/usr/bin/gpu-feature-discovery"]
           args:

diff --git a/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml b/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml
@@ -38,7 +38,7 @@ spec:
       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
       priorityClassName: "system-node-critical"
       containers:
-      - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.4
+      - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
         name: nvidia-device-plugin-ctr
         env:
           - name: FAIL_ON_INIT_ERROR

diff --git a/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml b/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml
@@ -124,7 +124,7 @@ spec:
         - env:
             - name: PASS_DEVICE_SPECS
               value: "true"
-          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.1
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
           name: nvidia-device-plugin-ctr
           securityContext:
             privileged: true

diff --git a/deployments/static/nvidia-device-plugin.yml b/deployments/static/nvidia-device-plugin.yml
@@ -38,7 +38,7 @@ spec:
       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
       priorityClassName: "system-node-critical"
       containers:
-      - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.1
+      - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
         name: nvidia-device-plugin-ctr
         env:
           - name: FAIL_ON_INIT_ERROR

diff --git a/nvidia-device-plugin.yml b/nvidia-device-plugin.yml
@@ -38,7 +38,7 @@ spec:
       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
       priorityClassName: "system-node-critical"
       containers:
-      - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.1
+      - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
         name: nvidia-device-plugin-ctr
         env:
           - name: FAIL_ON_INIT_ERROR

diff --git a/versions.mk b/versions.mk
@@ -17,7 +17,7 @@ MODULE := github.com/NVIDIA/$(DRIVER_NAME)
 
 REGISTRY ?= nvcr.io/nvidia
 
-VERSION  ?= v0.15.0-rc.1
+VERSION  ?= v0.15.0-rc.2
 
 # vVERSION represents the version with a guaranteed v-prefix
 vVERSION := v$(VERSION:v%=%)