From 5d45c96ce12f7f16c21e61db1a78e94a09c16007 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Fri, 20 Sep 2024 10:07:53 +0200 Subject: [PATCH] fix(mixin): Remove pod label from disk usage aggregation (#14180) This PR fixes the disk read and writes dashboards to support node-exporter pod name changes when node changes often (tools like karpenter for instance, autoscaling, etc.) as this causes duplicate entries when checking metrics over 1h --- .../dashboards/loki-resources-overview.json | 8 ++++---- .../dashboards/loki-reads-resources.json | 12 ++++++------ .../dashboards/loki-writes-resources.json | 4 ++-- .../dashboards/loki-reads-resources.libsonnet | 12 ++++++------ .../dashboards/loki-resources-overview.libsonnet | 8 ++++---- .../dashboards/loki-writes-resources.libsonnet | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json b/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json index 72b6eaf785b42..3b3d7e7c0a40b 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json @@ -642,7 +642,7 @@ "span": 4, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -689,7 +689,7 @@ "span": 4, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -1059,7 +1059,7 @@ "span": 4, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -1106,7 +1106,7 @@ "span": 4, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null diff --git a/production/loki-mixin-compiled/dashboards/loki-reads-resources.json b/production/loki-mixin-compiled/dashboards/loki-reads-resources.json index f2c9aa7d6abe0..29c1240216659 100644 --- a/production/loki-mixin-compiled/dashboards/loki-reads-resources.json +++ b/production/loki-mixin-compiled/dashboards/loki-reads-resources.json @@ -844,7 +844,7 @@ "span": 2, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -891,7 +891,7 @@ "span": 2, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance,device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -1249,7 +1249,7 @@ "span": 2, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -1296,7 +1296,7 @@ "span": 2, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -1654,7 +1654,7 @@ "span": 2, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -1701,7 +1701,7 @@ "span": 2, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null diff --git a/production/loki-mixin-compiled/dashboards/loki-writes-resources.json b/production/loki-mixin-compiled/dashboards/loki-writes-resources.json index 8fd17e8fcbdff..b99fcf145cd36 100644 --- a/production/loki-mixin-compiled/dashboards/loki-writes-resources.json +++ b/production/loki-mixin-compiled/dashboards/loki-writes-resources.json @@ -630,7 +630,7 @@ "span": 1, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null @@ -677,7 +677,7 @@ "span": 1, "targets": [ { - "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", "legendFormat": "{{pod}} - {{device}}", "legendLink": null diff --git a/production/loki-mixin/dashboards/loki-reads-resources.libsonnet b/production/loki-mixin/dashboards/loki-reads-resources.libsonnet index 2635596142d6c..b300051961f17 100644 --- a/production/loki-mixin/dashboards/loki-reads-resources.libsonnet +++ b/production/loki-mixin/dashboards/loki-reads-resources.libsonnet @@ -70,7 +70,7 @@ .addPanel( $.newQueryPanel('Disk Writes', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')], + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('querier')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -78,7 +78,7 @@ .addPanel( $.newQueryPanel('Disk Reads', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')], + 'sum by(%s,device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('querier')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -102,7 +102,7 @@ .addPanel( $.newQueryPanel('Disk Writes', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(index_gateway_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(index_gateway_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -110,7 +110,7 @@ .addPanel( $.newQueryPanel('Disk Reads', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(index_gateway_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(index_gateway_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -133,7 +133,7 @@ .addPanel( $.newQueryPanel('Disk Writes', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('bloom-gateway')], + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('bloom-gateway')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -141,7 +141,7 @@ .addPanel( $.newQueryPanel('Disk Reads', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('bloom-gateway')], + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('bloom-gateway')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, diff --git a/production/loki-mixin/dashboards/loki-resources-overview.libsonnet b/production/loki-mixin/dashboards/loki-resources-overview.libsonnet index a93df5d42e41d..b0f7500a49456 100644 --- a/production/loki-mixin/dashboards/loki-resources-overview.libsonnet +++ b/production/loki-mixin/dashboards/loki-resources-overview.libsonnet @@ -58,7 +58,7 @@ .addPanel( $.newQueryPanel('Disk Writes', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(write_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(write_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -66,7 +66,7 @@ .addPanel( $.newQueryPanel('Disk Reads', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(write_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(write_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -92,7 +92,7 @@ .addPanel( $.newQueryPanel('Disk Writes', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(backend_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(backend_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -100,7 +100,7 @@ .addPanel( $.newQueryPanel('Disk Reads', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(backend_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(backend_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, diff --git a/production/loki-mixin/dashboards/loki-writes-resources.libsonnet b/production/loki-mixin/dashboards/loki-writes-resources.libsonnet index c19b60606cd72..2d78776422ed2 100644 --- a/production/loki-mixin/dashboards/loki-writes-resources.libsonnet +++ b/production/loki-mixin/dashboards/loki-writes-resources.libsonnet @@ -61,7 +61,7 @@ .addPanel( $.newQueryPanel('Disk Writes', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(ingester_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(ingester_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking, @@ -69,7 +69,7 @@ .addPanel( $.newQueryPanel('Disk Reads', 'Bps') + $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(ingester_pod_matcher)], + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(ingester_pod_matcher)], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.withStacking,