diff --git a/CHANGELOG.md b/CHANGELOG.md index e5334fea3..89da1f331 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,15 @@ ## Changelog +### Version v0.15.0-rc.2 - Bump CUDA base image version to 12.3.2 - Add `cdi-cri` device list strategy. This uses the CDIDevices CRI field to request CDI devices instead of annotations. - Set MPS memory limit by device index and not device UUID. This is a workaround for an issue where these limits are not applied for devices if set by UUID. - Update MPS sharing to disallow requests for multiple devices if MPS sharing is configured. +- Set mps device memory limit by index. +- Explicitly set sharing.mps.failRequestsGreaterThanOne = true. +- Run tail -f for each MPS daemon to output logs. +- Enforce replica limits for MPS sharing. ### Version v0.15.0-rc.1 - Import GPU Feature Discovery into the GPU Device Plugin repo. This means that diff --git a/deployments/helm/nvidia-device-plugin/Chart.yaml b/deployments/helm/nvidia-device-plugin/Chart.yaml index 4346c86a8..bef4df79a 100644 --- a/deployments/helm/nvidia-device-plugin/Chart.yaml +++ b/deployments/helm/nvidia-device-plugin/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: nvidia-device-plugin type: application description: A Helm chart for the nvidia-device-plugin on Kubernetes -version: "0.15.0-rc.1" -appVersion: "0.15.0-rc.1" +version: "0.15.0-rc.2" +appVersion: "0.15.0-rc.2" kubeVersion: ">= 1.10.0-0" home: https://github.com/NVIDIA/k8s-device-plugin diff --git a/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml b/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml index 6a8d91cb5..de0e50ae2 100644 --- a/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml +++ b/deployments/static/gpu-feature-discovery-daemonset-with-mig-mixed.yaml @@ -4,7 +4,7 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: selector: @@ -15,11 +15,11 @@ spec: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] volumeMounts: diff --git a/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml b/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml index dd7ff308c..c6a52ab03 100644 --- a/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml +++ b/deployments/static/gpu-feature-discovery-daemonset-with-mig-single.yaml @@ -4,7 +4,7 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: selector: @@ -15,11 +15,11 @@ spec: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] volumeMounts: diff --git a/deployments/static/gpu-feature-discovery-daemonset.yaml b/deployments/static/gpu-feature-discovery-daemonset.yaml index 38cce2d9c..8eb54e94b 100644 --- a/deployments/static/gpu-feature-discovery-daemonset.yaml +++ b/deployments/static/gpu-feature-discovery-daemonset.yaml @@ -4,7 +4,7 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: selector: @@ -15,11 +15,11 @@ spec: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] volumeMounts: diff --git a/deployments/static/gpu-feature-discovery-job.yaml.template b/deployments/static/gpu-feature-discovery-job.yaml.template index 7baa97e90..43ece7671 100644 --- a/deployments/static/gpu-feature-discovery-job.yaml.template +++ b/deployments/static/gpu-feature-discovery-job.yaml.template @@ -4,19 +4,19 @@ metadata: name: gpu-feature-discovery labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: template: metadata: labels: app.kubernetes.io/name: gpu-feature-discovery - app.kubernetes.io/version: 0.14.2 + app.kubernetes.io/version: 0.15.0-rc.2 app.kubernetes.io/part-of: nvidia-gpu spec: nodeName: NODE_NAME containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.3 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: gpu-feature-discovery command: ["/usr/bin/gpu-feature-discovery"] args: diff --git a/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml b/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml index 15ad02695..25c320fe2 100644 --- a/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml +++ b/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml @@ -38,7 +38,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.4 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR diff --git a/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml b/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml index f453adb4c..9ecd71c2f 100644 --- a/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml +++ b/deployments/static/nvidia-device-plugin-privileged-with-service-account.yml @@ -124,7 +124,7 @@ spec: - env: - name: PASS_DEVICE_SPECS value: "true" - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.1 + image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: nvidia-device-plugin-ctr securityContext: privileged: true diff --git a/deployments/static/nvidia-device-plugin.yml b/deployments/static/nvidia-device-plugin.yml index d612f06b0..f750d79e4 100644 --- a/deployments/static/nvidia-device-plugin.yml +++ b/deployments/static/nvidia-device-plugin.yml @@ -38,7 +38,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.1 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR diff --git a/nvidia-device-plugin.yml b/nvidia-device-plugin.yml index d612f06b0..f750d79e4 100644 --- a/nvidia-device-plugin.yml +++ b/nvidia-device-plugin.yml @@ -38,7 +38,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.1 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR diff --git a/versions.mk b/versions.mk index 2c550f226..5b9f2c67b 100644 --- a/versions.mk +++ b/versions.mk @@ -17,7 +17,7 @@ MODULE := github.com/NVIDIA/$(DRIVER_NAME) REGISTRY ?= nvcr.io/nvidia -VERSION ?= v0.15.0-rc.1 +VERSION ?= v0.15.0-rc.2 # vVERSION represents the version with a guaranteed v-prefix vVERSION := v$(VERSION:v%=%)