Skip to content

Commit

Permalink
Upgrading autopilot to version v2.1.0
Browse files (browse the repository at this point in the history)
This is for the latest GPU health checks from IBM.
  • Loading branch information
computate committed Feb 3, 2025
1 parent e6b26ef commit ff74fdc
Show file tree
Hide file tree
Showing 7 changed files with 798 additions and 554 deletions.
48 changes: 20 additions & 28 deletions autopilot/base/daemonsets/autopilot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,51 +11,45 @@ spec:
app: autopilot
template:
metadata:
annotations:
null
labels:
app: autopilot
spec:
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu.product
operator: Equal
value: NVIDIA-A100-SXM4-40GB
- effect: NoSchedule
key: nvidia.com/gpu.product
operator: Equal
value: Tesla-V100-PCIE-32GB
nodeSelector:
nvidia.com/gpu.present: 'true'
serviceAccountName: autopilot
securityContext: {}
initContainers:
- args:
- |
until [ -f /usr/bin/nvidia-smi ]; do echo waiting for nvidia device plug-in to be setup; sleep 5 && exit -1; done
- until [ -f /usr/bin/nvidia-smi ]; do echo waiting for nvidia device plug-in to be setup; sleep 5 && exit -1; done
command:
- sh
- -c
image: quay.io/autopilot/autopilot:v1.9.0
image: quay.io/autopilot/autopilot:v2.1.0
imagePullPolicy: Always
name: device-plugin-validation
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
runAsNonRoot: true
containers:
- image: quay.io/autopilot/autopilot:v1.9.0
- image: quay.io/autopilot/autopilot:v2.1.0
command:
- sh
- -c
- |
iperf3 -s -p 6310 -D
/usr/local/bin/autopilot --port 3333 --loglevel=2 --bw 4 --w 1 --invasive-check-timer 4
- sh
- -c
- |
/usr/local/bin/autopilot --port 3333 --loglevel=2 --bw 4 --w 1 --invasive-check-timer 4
imagePullPolicy: Always
name: autopilot
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
runAsNonRoot: true
env:
- name: PERIODIC_CHECKS
value: pciebw,remapped,dcgm,ping,gpupower
- name: PVC_TEST_STORAGE_CLASS
value:
value: ''
- name: "NODE_NAME"
valueFrom:
fieldRef:
Expand Down Expand Up @@ -91,8 +85,6 @@ spec:
- nvidia-smi
resources:
limits:
nvidia.com/gpu: 0
nvidia.com/gpu: '0'
requests:
nvidia.com/gpu: 0
volumeMounts: []
volumes: []
nvidia.com/gpu: '0'
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ spec:
selector:
matchLabels:
app: autopilot
service: autopilot-metrics-service
1 change: 1 addition & 0 deletions autopilot/base/services/autopilot-healthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ kind: Service
metadata:
labels:
app: autopilot
service: autopilot-healthchecks
name: autopilot-healthchecks
namespace: autopilot
annotations:
Expand Down
1 change: 1 addition & 0 deletions autopilot/base/services/autopilot-metrics-service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ kind: Service
metadata:
labels:
app: autopilot
service: autopilot-metrics-service
name: autopilot-metrics-service
namespace: autopilot
spec:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ kind: Service
metadata:
labels:
app: autopilot
service: autopilot-readinessprobe
name: autopilot-readinessprobe
namespace: autopilot
spec:
Expand Down
2 changes: 1 addition & 1 deletion autopilot/base/services/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ kind: Kustomization
resources:
- autopilot-metrics-service.yaml
- autopilot-healthchecks.yaml
- autopilot.yaml
- autopilot-readinessprobe.yaml
Loading

0 comments on commit ff74fdc

Please sign in to comment.