diff --git a/autopilot/base/clusterrolebindings/autopilot-privileged.yaml b/autopilot/base/clusterrolebindings/autopilot-privileged.yaml new file mode 100644 index 00000000..ca2a5d9e --- /dev/null +++ b/autopilot/base/clusterrolebindings/autopilot-privileged.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: autopilot-privileged + namespace: autopilot +subjects: +- kind: ServiceAccount + name: autopilot + namespace: autopilot +roleRef: + kind: ClusterRole + name: system:openshift:scc:privileged + apiGroup: rbac.authorization.k8s.io diff --git a/autopilot/base/clusterrolebindings/kustomization.yaml b/autopilot/base/clusterrolebindings/kustomization.yaml index 5742f7e4..98e84e4f 100644 --- a/autopilot/base/clusterrolebindings/kustomization.yaml +++ b/autopilot/base/clusterrolebindings/kustomization.yaml @@ -2,4 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - autopilot.yaml +- autopilot-privileged.yaml - prometheus-k8s-autopilot.yaml diff --git a/autopilot/base/daemonsets/autopilot.yaml b/autopilot/base/daemonsets/autopilot.yaml index 1dee78cd..a548b2ac 100644 --- a/autopilot/base/daemonsets/autopilot.yaml +++ b/autopilot/base/daemonsets/autopilot.yaml @@ -11,51 +11,47 @@ spec: app: autopilot template: metadata: - annotations: - null labels: app: autopilot spec: + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu.product + operator: Equal + value: NVIDIA-A100-SXM4-40GB + - effect: NoSchedule + key: nvidia.com/gpu.product + operator: Equal + value: Tesla-V100-PCIE-32GB nodeSelector: nvidia.com/gpu.present: 'true' serviceAccountName: autopilot + securityContext: {} initContainers: - args: - - | - until [ -f /usr/bin/nvidia-smi ]; do echo waiting for nvidia device plug-in to be setup; sleep 5 && exit -1; done + - until [ -f /usr/bin/nvidia-smi ]; do echo waiting for nvidia device plug-in to be setup; sleep 5 && exit -1; done command: - sh - -c - image: quay.io/autopilot/autopilot:v1.9.0 + image: quay.io/autopilot/autopilot:v2.1.0 imagePullPolicy: Always name: device-plugin-validation securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true + privileged: true containers: - - image: quay.io/autopilot/autopilot:v1.9.0 + - image: quay.io/autopilot/autopilot:v2.1.0 command: - - sh - - -c - - | - iperf3 -s -p 6310 -D - /usr/local/bin/autopilot --port 3333 --loglevel=2 --bw 4 --w 1 --invasive-check-timer 4 + - sh + - -c + - | + /usr/local/bin/autopilot --port 3333 --loglevel=2 --bw 4 --w 1 --invasive-check-timer 4 imagePullPolicy: Always name: autopilot - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - runAsNonRoot: true env: - name: PERIODIC_CHECKS value: pciebw,remapped,dcgm,ping,gpupower - name: PVC_TEST_STORAGE_CLASS - value: + value: '' - name: "NODE_NAME" valueFrom: fieldRef: @@ -91,8 +87,8 @@ spec: - nvidia-smi resources: limits: - nvidia.com/gpu: 0 + nvidia.com/gpu: '0' requests: - nvidia.com/gpu: 0 - volumeMounts: [] - volumes: [] + nvidia.com/gpu: '0' + securityContext: + privileged: true diff --git a/autopilot/base/servicemonitors/autopilot-metrics-monitor.yaml b/autopilot/base/servicemonitors/autopilot-metrics-monitor.yaml index a908d8c6..7eed74fd 100644 --- a/autopilot/base/servicemonitors/autopilot-metrics-monitor.yaml +++ b/autopilot/base/servicemonitors/autopilot-metrics-monitor.yaml @@ -15,3 +15,4 @@ spec: selector: matchLabels: app: autopilot + service: autopilot-metrics-service diff --git a/autopilot/base/services/autopilot-healthchecks.yaml b/autopilot/base/services/autopilot-healthchecks.yaml index 9c61e4b3..476ffc56 100644 --- a/autopilot/base/services/autopilot-healthchecks.yaml +++ b/autopilot/base/services/autopilot-healthchecks.yaml @@ -3,6 +3,7 @@ kind: Service metadata: labels: app: autopilot + service: autopilot-healthchecks name: autopilot-healthchecks namespace: autopilot annotations: diff --git a/autopilot/base/services/autopilot-metrics-service.yaml b/autopilot/base/services/autopilot-metrics-service.yaml index 81938429..3ad734a1 100644 --- a/autopilot/base/services/autopilot-metrics-service.yaml +++ b/autopilot/base/services/autopilot-metrics-service.yaml @@ -3,6 +3,7 @@ kind: Service metadata: labels: app: autopilot + service: autopilot-metrics-service name: autopilot-metrics-service namespace: autopilot spec: diff --git a/autopilot/base/services/autopilot.yaml b/autopilot/base/services/autopilot-readinessprobe.yaml similarity index 86% rename from autopilot/base/services/autopilot.yaml rename to autopilot/base/services/autopilot-readinessprobe.yaml index 41aba839..16a43eca 100644 --- a/autopilot/base/services/autopilot.yaml +++ b/autopilot/base/services/autopilot-readinessprobe.yaml @@ -3,6 +3,7 @@ kind: Service metadata: labels: app: autopilot + service: autopilot-readinessprobe name: autopilot-readinessprobe namespace: autopilot spec: diff --git a/autopilot/base/services/kustomization.yaml b/autopilot/base/services/kustomization.yaml index c9b28410..da412a88 100644 --- a/autopilot/base/services/kustomization.yaml +++ b/autopilot/base/services/kustomization.yaml @@ -3,4 +3,4 @@ kind: Kustomization resources: - autopilot-metrics-service.yaml - autopilot-healthchecks.yaml -- autopilot.yaml +- autopilot-readinessprobe.yaml diff --git a/autopilot/observability/grafanadashboards/autopilot.yaml b/autopilot/observability/grafanadashboards/autopilot.yaml index 8c954296..f4654ffa 100644 --- a/autopilot/observability/grafanadashboards/autopilot.yaml +++ b/autopilot/observability/grafanadashboards/autopilot.yaml @@ -12,11 +12,49 @@ spec: folder: IBM autopilot json: | { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.17" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { + "$$hashKey": "object:192", "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": "observability-metrics", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -25,249 +63,261 @@ spec: } ] }, + "description": "This dashboard displays Nodes health and utilization", "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, + "gnetId": 12239, "graphTooltip": 0, - "id": 1, - "iteration": 1689354875983, + "id": null, "links": [], + "liveNow": false, "panels": [ { - "collapsed": false, "datasource": "observability-metrics", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "title": "Single Node Stats", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": null, + "description": "Autopilot labeled nodes with unhealthy GPUs", "fieldConfig": { - "defaults": {}, + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "All GPU nodes are healthy", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + "y": 0 }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", + "id": 67, "options": { - "alertThreshold": true + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "autopilot_health_checks{cluster=\"$cluster\",node=\"$node\",health=\"$health\",deviceid=\"$deviceid\"}", - "interval": "", - "legendFormat": "{{ health }} for device {{ deviceid }} on {{ node }}", + "datasource": "observability-metrics", + "editorMode": "code", + "expr": "kube_node_labels{label_autopilot_ibm_com_gpuhealth=~\".*ERR.*\"} and kube_node_labels{label_autopilot_ibm_com_gpuhealth!~\"\"}", + "legendFormat": "__auto", + "range": true, "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Single Node Metrics", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": false, - "datasource": "observability-metrics", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 - }, - "id": 24, - "panels": [], - "title": "All Nodes", - "type": "row" + "title": "Nodes with unhealthy GPUs", + "type": "timeseries" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "observability-metrics", + "description": "", "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "noValue": "GPUs are healthy", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + { + "id": "custom.width", + "value": 465 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "deviceid" + }, + "properties": [ + { + "id": "custom.width", + "value": 90 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ + { + "id": "custom.width", + "value": 249 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.width", + "value": 228 + } + ] + } + ] }, - "fill": 1, - "fillGradient": 3, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false + "x": 12, + "y": 0 }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", + "id": 53, "options": { - "alertThreshold": true + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "pluginVersion": "9.5.17", "targets": [ { + "editorMode": "code", "exemplar": true, - "expr": "autopilot_health_checks{health=\"pciebw\"}", - "hide": false, - "interval": "", - "legendFormat": "GPU {{ deviceid }} - {{ node }}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "autopilot_health_checks{health=\"pciebw\"cluster=\"$cluster\",,node=\"$node\"}", - "hide": true, + "expr": "sum(autopilot_health_checks{node=~\"$node\", health=~\"power-slowdown\"} > 0) by (node, deviceid)", + "format": "table", "instant": true, "interval": "", - "legendFormat": "GPU {{ deviceid }} - {{ node }}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "PCIe Bandwidths (Gauge)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "legendFormat": "", + "refId": "A" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "title": "GPU HW Power Brake Slowdown Active", + "type": "table" }, { - "datasource": "observability-metrics", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, "mappings": [], + "noValue": "Idle GPUs Nodes (if any) are healthy", "thresholds": { "mode": "absolute", "steps": [ @@ -277,57 +327,127 @@ spec: }, { "color": "red", - "value": "" + "value": 80 } ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + } + ] + } + ] }, "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 18 + "h": 7, + "w": 12, + "x": 12, + "y": 7 }, - "id": 14, + "id": 63, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], + "cellHeight": "sm", + "footer": { + "countRows": false, "fields": "", - "values": false + "reducer": [ + "sum" + ], + "show": false }, - "text": {}, - "textMode": "auto" + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Value" + } + ] }, - "pluginVersion": "7.5.17", + "pluginVersion": "9.5.17", "targets": [ { + "editorMode": "code", "exemplar": true, - "expr": "autopilot_health_checks{health=\"pciebw\"} < 4", + "expr": "sum(autopilot_health_checks{health=~\"remapped\"}) by (node) > 0 and on (node) count(label_replace(DCGM_FI_DEV_GPU_UTIL{exported_namespace=~\"\"}==0, \"node\", \"$1\", \"Hostname\", \"(.+)\")) by (node) == 8", + "format": "table", + "hide": false, + "instant": true, "interval": "", - "legendFormat": "GPU {{deviceid}} - {{node}}", + "legendFormat": "", "refId": "A" } ], - "title": "GPUs with low PCIeBW (less than 4 GB/s)", - "type": "stat" + "title": "Idle GPU nodes with GPU Pending Row-Remapping (Sum on a node)", + "type": "table" }, { "datasource": "observability-metrics", - "description": "", + "description": "Checks if any autopilot pods died", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], + "noValue": "All autopilot pods are healthy", "thresholds": { "mode": "absolute", "steps": [ @@ -337,19 +457,10 @@ spec: }, { "color": "red", - "value": 0 - }, - { - "color": "#EAB839", - "value": 1 - }, - { - "color": "green", - "value": 2 + "value": 80 } ] - }, - "unit": "string" + } }, "overrides": [] }, @@ -357,341 +468,474 @@ spec: "h": 8, "w": 12, "x": 0, - "y": 27 + "y": 8 }, - "id": 4, + "id": 66, "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "7.5.17", "targets": [ { - "exemplar": true, - "expr": "autopilot_health_report_total{health=\"netdevice\"}", - "format": "time_series", - "hide": true, - "interval": "", - "legendFormat": "NIC {{ deviceid }}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "group (avg(autopilot_health_report_total{health=\"netdevice\"})) by (node))", - "hide": true, - "interval": "", - "legendFormat": "{{ node }} ", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(autopilot_health_checks{health=\"net-reach\"})by(node) < 2", + "datasource": "observability-metrics", + "editorMode": "code", + "expr": "(kube_pod_info and on (pod) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", namespace=~\"autopilot.*\"} > 0 or kube_pod_container_status_terminated_reason{reason=~\"Error\", namespace=~\"autopilot.*\"} > 0))", "hide": false, - "interval": "", - "legendFormat": "{{ node }}", - "refId": "C" + "legendFormat": "__auto", + "range": true, + "refId": "A" } ], - "timeFrom": null, - "timeShift": null, - "title": "Faulty Secondary NIC by Node", - "type": "gauge" + "title": "Autopilot Pod Health", + "type": "timeseries" }, { "datasource": "observability-metrics", - "description": "Checks remapped rows on GPUs. If no remapped rows, then value is 0. 1 otherwise.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, + "custom": { + "align": "right", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, "mappings": [], + "noValue": "All nodes can be reached with PING", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } }, { - "color": "red", - "value": 1 + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 0 + } + ] + } } ] } - }, - "overrides": [] + ] }, "gridPos": { - "h": 8, + "h": 10, "w": 12, "x": 12, - "y": 27 + "y": 14 }, - "id": 8, + "id": 64, "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], + "cellHeight": "sm", + "footer": { + "countRows": false, "fields": "", - "values": false + "reducer": [ + "sum" + ], + "show": false }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} + "showHeader": true, + "sortBy": [] }, - "pluginVersion": "7.5.17", + "pluginVersion": "9.5.17", "targets": [ { - "exemplar": true, - "expr": "sum(autopilot_health_checks{health=\"remapped\"})by(node)>0", - "hide": false, + "datasource": "observability-metrics", + "exemplar": false, + "expr": "sum(label_replace(autopilot_health_checks{health=\"ping\",node=\"$node\"}, \"targetNode\", \"$1\", \"deviceid\", \"(.*)\"))by(node,targetNode)", + "format": "table", + "instant": true, "interval": "", - "legendFormat": " {{ node }}", - "refId": "B" + "legendFormat": "", + "refId": "A" } ], - "timeFrom": null, - "timeShift": null, - "title": "Faulty Remapped Rows by Node", - "type": "gauge" + "title": "PING Failures", + "type": "table" }, { - "collapsed": true, "datasource": "observability-metrics", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 35 - }, - "id": 26, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "observability-metrics", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 11, - "x": 0, - "y": 27 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "description": "Autopilot test to check if a PVC can be created", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum_over_time(scrape_series_added{job=\"autopilot-metrics-service\"}[1h]) ", - "interval": "", - "legendFormat": "{{pod}}", - "refId": "A" + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Approximate number of new series in this scrape", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "mappings": [], + "noValue": "PVCs can be created", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "pokprod-b93r43s2" + ], + "prefix": "All except:", + "readOnly": true + } }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 65, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, "datasource": "observability-metrics", - "fieldConfig": { - "defaults": {}, - "overrides": [] + "editorMode": "code", + "exemplar": false, + "expr": "sum (autopilot_health_checks{health=\"pvc\"}==1) by (node)", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "PVC Test", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 27 + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false }, - "hiddenSeries": false, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + "mappings": [], + "noValue": "PCIe BW normal", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + } + ] }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "green", + "value": 3.5 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + } + ] }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "topk(10, sum without(instance)(sum_over_time(scrape_series_added[1h])))", - "hide": true, - "interval": "", - "legendFormat": "", - "refId": "A" + { + "matcher": { + "id": "byName", + "options": "node" }, - { - "exemplar": true, - "expr": "sum_over_time(scrape_samples_scraped{job=\"autopilot-metrics-service\"}[1h])", - "hide": false, - "interval": "", - "legendFormat": "{{pod}}", - "refId": "C" - } + "properties": [ + { + "id": "custom.width", + "value": 273 + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 59, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Number of samples exposed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.5.17", + "targets": [ + { + "exemplar": true, + "expr": "avg(avg_over_time(autopilot_health_checks{health=~\"pciebw\"}[12h])) by (node, deviceid) < 3.4 ", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "exemplar": true, + "expr": "avg(avg_over_time(autopilot_health_checks{health=~\"pciebw\", node=~\"$node\"}[12h])) by (node, deviceid) < 5", + "hide": true, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "title": "Nodes with Low PCIe Bandwidth (<3.4 GB/s)", + "type": "table" + }, + { + "datasource": "observability-metrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "custom": { + "align": "right", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "mappings": [], + "noValue": "GPUs are healthy", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 47, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" ], - "yaxis": { - "align": false, - "alignLevel": null + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" } + ] + }, + "pluginVersion": "9.5.17", + "targets": [ + { + "datasource": "observability-metrics", + "exemplar": true, + "expr": "sum(autopilot_health_checks{health=~\"remapped\", node=~\"$node\"}) by (node) > 0", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" } ], - "title": "Timeseries Stats", - "type": "row" + "title": "GPU Pending Row-Remapping (Sum on a node)", + "type": "table" } ], - "refresh": "1m", - "schemaVersion": 27, + "refresh": "", + "schemaVersion": 38, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": "", + "current": {}, "datasource": "observability-metrics", "definition": "label_values(autopilot_health_checks, cluster)", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Cluster", @@ -707,17 +951,15 @@ spec: "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": "", + "current": {}, "datasource": "observability-metrics", "definition": "label_values(autopilot_health_checks, node)", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Node", @@ -729,34 +971,28 @@ spec: "refId": "StandardVariableQuery" }, "refresh": 1, - "regex": "/wrk-*/", + "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": "", - "current": { - "selected": false, - "text": "pciebw", - "value": "pciebw" - }, "datasource": "observability-metrics", - "definition": "label_values(autopilot_health_checks, health)", - "description": null, + "definition": "label_values(autopilot_health_checks, deviceid)", + "description": "GPU (0-7)", "error": null, "hide": 0, - "includeAll": false, - "label": "Health Check", - "multi": false, - "name": "health", + "includeAll": true, + "label": "Device ID", + "multi": true, + "name": "deviceid", "options": [], "query": { - "query": "label_values(autopilot_health_checks, health)", + "query": "label_values(autopilot_health_checks, deviceid)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -771,26 +1007,24 @@ spec: }, { "allValue": "", + "current": {}, "datasource": "observability-metrics", - "definition": "label_values(autopilot_health_checks, deviceid)", - "description": "GPU (0-7)", - "error": null, + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace!~\"openshift-.*|ibm-.*|nvidia-.*|kube-system|kubeflow|kyverno|mpi-operator|scheduler-plugins|.*grafana.*\"}, namespace)", "hide": 0, "includeAll": true, - "label": "Device ID", + "label": "Namespace", "multi": true, - "name": "deviceid", + "name": "namespace", "options": [], "query": { - "query": "label_values(autopilot_health_checks, deviceid)", - "refId": "StandardVariableQuery" + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace!~\"openshift-.*|ibm-.*|nvidia-.*|kube-system|kubeflow|kyverno|mpi-operator|scheduler-plugins|.*grafana.*\"}, namespace)", + "refId": "Prometheus-instance-Variable-Query" }, "refresh": 1, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 3, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -798,12 +1032,26 @@ spec: ] }, "time": { - "from": "now-30m", + "from": "now-5m", "to": "now" }, - "timepicker": {}, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, "timezone": "", - "title": "Autopilot", - "uid": "Ny3de_UVz", - "version": 8 + "title": "Autopilot Metrics", + "uid": "Oxed_c6W3", + "version": 4, + "weekStart": "" }