{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__elements": {}, "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "10.3.1" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "table", "name": "Table", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "links": [], "liveNow": false, "panels": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "red", "index": 3 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "celsius", "unitScale": true }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "id": 2, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "title": "Machine GPU Temps", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": 
true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, 
"renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "red", "index": 3 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "celsius", "unitScale": true }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "id": 5, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": 
"${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_VRAM_TEMP{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "title": "Machine GPU VRAM Temps", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, 
"UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { 
"defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 200 }, "type": "range" }, { "options": { "from": 201, "result": { "color": "yellow", "index": 2 }, "to": 300 }, "type": "range" }, { "options": { "from": 301, "result": { "color": "red", "index": 3 }, "to": 500 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "watt", "unitScale": true }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "id": 6, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", 
"uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "title": "Machine GPU Power", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": 
true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { 
"from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "orange", "index": 3 }, "to": 90 }, "type": "range" }, { "options": { "from": 91, "result": { "color": "red", "index": 4 }, "to": 200 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "celsius", "unitScale": true }, "overrides": [] }, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 8, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"5\"}", 
"format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_HOT_SPOT_TEMP{gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "title": "Machine GPU Core Hot Spot", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, 
"modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 50 }, "type": "range" }, { "options": { "from": 51, "result": { "color": "green", "index": 1 }, "to": 70 }, "type": "range" }, { "options": { "from": 71, "result": { "color": "yellow", "index": 2 }, "to": 80 }, "type": "range" }, { "options": { "from": 81, "result": { "color": "red", "index": 3 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "%", "unitScale": true 
}, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, "id": 7, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"4\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_FAN_SPEED{gpu=\"7\"}", 
"format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "title": "Machine GPU FAN Speeds", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true }, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 
1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, "modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "mode": "gradient", "type": "color-background" }, "filterable": false, "inspect": false, "minWidth": 65 }, "decimals": 0, "mappings": [ { "options": { "from": 0, "result": { "color": "blue", "index": 0 }, "to": 0 }, "type": "range" }, { "options": { "from": 1, "result": { "color": "red", "index": 1 }, "to": 100 }, "type": "range" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "bool", "unitScale": true }, "overrides": [] }, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 9, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 0, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"0\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": 
"DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"1\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"2\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"3\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"4\"} ", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"5\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"6\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "DCGM_FI_DEV_CLOCKS_THROTTLE_REASON{reason=\"SwThermalSlowdown\", gpu=\"7\"}", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" } ], "title": "Machine GPU Thermal Throttle", "transformations": [ { "id": "joinByField", "options": { "byField": "job", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "DCGM_FI_DRIVER_VERSION 
1": true, "DCGM_FI_DRIVER_VERSION 2": true, "DCGM_FI_DRIVER_VERSION 3": true, "DCGM_FI_DRIVER_VERSION 4": true, "Hostname 1": true, "Hostname 2": true, "Hostname 3": true, "Hostname 4": true, "Hostname 5": true, "Hostname 6": true, "Hostname 7": true, "Hostname 8": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "UUID 1": true, "UUID 2": true, "UUID 3": true, "UUID 4": true, "UUID 5": true, "UUID 6": true, "UUID 7": true, "UUID 8": true, "Value #D": false, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "__name__ 6": true, "__name__ 7": true, "__name__ 8": true, "device 1": true, "device 2": true, "device 3": true, "device 4": true, "device 5": true, "device 6": true, "device 7": true, "device 8": true, "gpu 1": true, "gpu 2": true, "gpu 3": true, "gpu 4": true, "gpu 5": true, "gpu 6": true, "gpu 7": true, "gpu 8": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "instance 7": true, "instance 8": true, "modelName 1": true, "modelName 2": true, "modelName 3": true, "modelName 4": true, "modelName 5": true, "modelName 6": true, "modelName 7": true, "modelName 8": true, "reason": true }, "includeByName": {}, "indexByName": { "DCGM_FI_DRIVER_VERSION 1": 1, "DCGM_FI_DRIVER_VERSION 2": 13, "DCGM_FI_DRIVER_VERSION 3": 22, "DCGM_FI_DRIVER_VERSION 4": 32, "Hostname 1": 5, "Hostname 2": 14, "Hostname 3": 23, "Hostname 4": 33, "Time 1": 4, "Time 2": 12, "Time 3": 21, "Time 4": 31, "UUID 1": 6, "UUID 2": 15, "UUID 3": 24, "UUID 4": 34, "Value #A": 2, "Value #B": 3, "Value #C": 30, "Value #D": 40, "__name__ 1": 7, "__name__ 2": 16, "__name__ 3": 25, "__name__ 4": 35, "device 1": 8, "device 2": 17, "device 3": 26, "device 4": 36, "gpu 1": 9, "gpu 2": 18, "gpu 3": 27, "gpu 4": 37, "instance 1": 10, "instance 2": 19, "instance 3": 28, "instance 4": 38, "job": 0, 
"modelName 1": 11, "modelName 2": 20, "modelName 3": 29, "modelName 4": 39 }, "renameByName": { "Hostname 1": "", "Value #A": "GPU 0", "Value #B": "GPU 1", "Value #C": "GPU 2", "Value #D": "GPU 3", "Value #E": "GPU 4", "Value #F": "GPU 5", "Value #G": "GPU 6", "Value #H": "GPU 7", "job": "Machine" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false, "minWidth": 50 }, "decimals": 1, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unitScale": true }, "overrides": [] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 }, "id": 1, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "frameIndex": 4, "showHeader": true }, "pluginVersion": "10.3.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum by(instance) (irate(node_cpu_seconds_total{mode!=\"idle\"}[$__rate_interval])) / on(instance) group_left sum by (instance) (irate(node_cpu_seconds_total[$__rate_interval]))) * 100\r\n", "format": "table", "legendFormat": "__auto", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "node_uname_info", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(sum by(instance) (node_memory_MemTotal_bytes{} - node_memory_MemFree_bytes{} - node_memory_Buffers_bytes{} - node_memory_Cached_bytes{})) / sum by(instance) (node_memory_MemTotal_bytes{}) * 100\r\n", "format": "table", "hide": 
false, "legendFormat": "__auto", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(node_time_seconds - on(instance) node_boot_time_seconds ) / 60 /60", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "round(max by(instance)(rate(node_network_transmit_bytes_total[1m])*8/1000000), 0.1)\r\n\r\n\r\n\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "round(max by(instance)(rate(node_network_receive_bytes_total[1m])*8/1000000), 0.1)\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(1 - avg by(instance)(node_filesystem_avail_bytes{mountpoint=\"/\"}/node_filesystem_size_bytes{mountpoint=\"/\"})) * 100\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "G" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "(1 - avg by(instance)(node_filesystem_avail_bytes{mountpoint=\"/var/lib/docker\"}/node_filesystem_size_bytes{mountpoint=\"/var/lib/docker\"})) * 100\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "H" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", "expr": "avg by(instance)(node_cpu_temperature{})\r\n", "format": "table", "hide": false, "legendFormat": "__auto", "range": true, "refId": "I" } ], "title": "Machine Overview", "transformations": [ { "id": "joinByField", "options": { "byField": "instance", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "Time 1": true, 
"Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "Time 9": true, "Value #B": true, "__name__": true, "domainname": true, "instance": true, "job": true, "machine": true, "sysname": true, "version": true }, "indexByName": { "Time 1": 4, "Time 2": 7, "Time 3": 14, "Time 4": 16, "Time 5": 18, "Time 6": 20, "Time 7": 22, "Time 8": 24, "Time 9": 26, "Value #A": 5, "Value #B": 13, "Value #C": 15, "Value #D": 17, "Value #E": 19, "Value #F": 21, "Value #G": 23, "Value #H": 25, "Value #I": 6, "__name__": 8, "domainname": 9, "instance": 3, "job": 10, "machine": 2, "nodename": 0, "release": 1, "sysname": 11, "version": 12 }, "renameByName": { "Time 2": "", "Time 5": "", "Value #A": "CPU %", "Value #C": "Memory Used %", "Value #D": "Uptime(h)", "Value #E": "Network_transmit Mbps", "Value #F": "Network_receive Mbps ", "Value #G": "Root FS Disk Usage %", "Value #H": "Docker Disk Usage %", "Value #I": "CPU Temp C", "nodename": "Machine", "release": "Kernel" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Machine" } ] } } ], "type": "table" } ], "refresh": "", "schemaVersion": 39, "tags": [ "Prometheuse", "Nvidia", "Linux" ], "templating": { "list": [] }, "time": { "from": "now-5m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "DC OverView", "uid": "b7573857-fb4d-4389-8159-edbad8b65e5b", "version": 23, "weekStart": "" }