From 8fba27bed75dcf07543b7ed5b8ed13758dd6fbba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 17 Jul 2024 13:55:14 +0200 Subject: [PATCH 01/13] mixin: add native cortex_request_duration_seconds in Reads dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../metamonitoring/grafana-dashboards.yaml | 69 +++++++++++++++++-- .../dashboards/mimir-reads.json | 69 +++++++++++++++++-- .../dashboards/mimir-reads.json | 69 +++++++++++++++++-- .../dashboards/dashboard-queries.libsonnet | 12 ++-- .../mimir-mixin/dashboards/reads.libsonnet | 34 +++------ operations/mimir-mixin/jsonnetfile.lock.json | 8 +-- 6 files changed, 214 insertions(+), 47 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index 34c312da392..7fe6344628a 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -17720,7 +17720,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(\n cortex_request_duration_seconds_count{\n cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",\n route=~\"(prometheus|api_prom)_api_v1_query\"\n }[$__rate_interval]\n )\n or\n rate(\n cortex_prometheus_rule_evaluations_total{\n cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"\n }[$__rate_interval]\n )\n)\n", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval])) + sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))) + sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"}[$__rate_interval])) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -17796,7 +17802,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -17872,7 +17884,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -17948,7 +17966,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -18024,7 +18048,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -22327,6 +22357,35 @@ data: "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 650a34a3642..8765cdbd175 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -91,7 +91,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(\n cortex_request_duration_seconds_count{\n cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",\n route=~\"(prometheus|api_prom)_api_v1_query\"\n }[$__rate_interval]\n )\n or\n rate(\n cortex_prometheus_rule_evaluations_total{\n cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"\n }[$__rate_interval]\n )\n)\n", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval])) + sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))) + sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"}[$__rate_interval])) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -167,7 +173,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -243,7 +255,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -319,7 +337,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -395,7 +419,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -4698,6 +4728,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 25701c8b1e4..84f58445333 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -91,7 +91,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(\n cortex_request_duration_seconds_count{\n cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",\n route=~\"(prometheus|api_prom)_api_v1_query\"\n }[$__rate_interval]\n )\n or\n rate(\n cortex_prometheus_rule_evaluations_total{\n cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"\n }[$__rate_interval]\n )\n)\n", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval])) + sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))) + sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~\"$cluster\", job=~\"($namespace)/((ruler|cortex|mimir|mimir-backend.*))\"}[$__rate_interval])) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -167,7 +173,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -243,7 +255,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -319,7 +337,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -395,7 +419,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" @@ -4698,6 +4728,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index e57ea4b72a6..bbcac377231 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -127,11 +127,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; local queryPerSecond(name) = 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)%(route)s"}[$__rate_interval]))' % (variables { route: std.filter(function(r) r.name == name, overviewRoutes)[0].routeLabel }), - instantQueriesPerSecond: queryPerSecond('instantQuery'), - rangeQueriesPerSecond: queryPerSecond('rangeQuery'), - labelNamesQueriesPerSecond: queryPerSecond('labelNames'), - labelValuesQueriesPerSecond: queryPerSecond('labelValues'), - seriesQueriesPerSecond: queryPerSecond('series'), + local ncQueryPerSecond(name) = utils.ncHistogramSumBy(utils.ncHistogramCountRate(p.requestsPerSecondMetric, '%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)%(route)s"' % + (variables { route: std.filter(function(r) r.name == name, overviewRoutes)[0].routeLabel }))), + ncInstantQueriesPerSecond: ncQueryPerSecond('instantQuery'), + ncRangeQueriesPerSecond: ncQueryPerSecond('rangeQuery'), + ncLabelNamesQueriesPerSecond: ncQueryPerSecond('labelNames'), + ncLabelValuesQueriesPerSecond: ncQueryPerSecond('labelValues'), + ncSeriesQueriesPerSecond: ncQueryPerSecond('series'), remoteReadQueriesPerSecond: queryPerSecond('remoteRead'), metadataQueriesPerSecond: queryPerSecond('metadata'), exemplarsQueriesPerSecond: queryPerSecond('exemplars'), diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index 305824146eb..7e4339dd796 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -6,6 +6,7 @@ local filename = 'mimir-reads.json'; assert std.md5(filename) == 'e327503188913dc38ad571c647eef643' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Reads') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() + .addShowNativeLatencyVariable() .addRowIf( $._config.show_dashboard_descriptions.reads, ($.row('Reads dashboard description') { height: '175px', showTitle: false }) @@ -38,26 +39,13 @@ local filename = 'mimir-reads.json'; showTitle: false, }) .addPanel( + local addRuleEvalRate(q) = { + local ruleEvalRate = ' + sum(rate(cortex_prometheus_rule_evaluations_total{' + $.jobMatcher($._config.job_names.ruler) + '}[$__rate_interval]))', + classic: q.classic + ruleEvalRate, + native: q.native + ruleEvalRate, + }; $.panel('Instant queries / sec') + - $.statPanel(||| - sum( - rate( - cortex_request_duration_seconds_count{ - %(queryFrontend)s, - route=~"(prometheus|api_prom)_api_v1_query" - }[$__rate_interval] - ) - or - rate( - cortex_prometheus_rule_evaluations_total{ - %(ruler)s - }[$__rate_interval] - ) - ) - ||| % { - queryFrontend: $.jobMatcher($._config.job_names.query_frontend), - ruler: $.jobMatcher($._config.job_names.ruler), - }, format='reqps') + + $.statPanel(addRuleEvalRate($.queries.query_frontend.ncInstantQueriesPerSecond), format='reqps') + $.panelDescription( 'Instant queries per second', ||| @@ -69,7 +57,7 @@ local filename = 'mimir-reads.json'; ) .addPanel( $.panel('Range queries / sec') + - $.statPanel($.queries.query_frontend.rangeQueriesPerSecond, format='reqps') + + $.statPanel($.queries.query_frontend.ncRangeQueriesPerSecond, format='reqps') + $.panelDescription( 'Range queries per second', ||| @@ -80,7 +68,7 @@ local filename = 'mimir-reads.json'; ) .addPanel( $.panel('Label names queries / sec') + - $.statPanel($.queries.query_frontend.labelNamesQueriesPerSecond, format='reqps') + + $.statPanel($.queries.query_frontend.ncLabelNamesQueriesPerSecond, format='reqps') + $.panelDescription( '"Label names" queries per second', ||| @@ -91,7 +79,7 @@ local filename = 'mimir-reads.json'; ) .addPanel( $.panel('Label values queries / sec') + - $.statPanel($.queries.query_frontend.labelValuesQueriesPerSecond, format='reqps') + + $.statPanel($.queries.query_frontend.ncLabelValuesQueriesPerSecond, format='reqps') + $.panelDescription( '"Label values" queries per second', ||| @@ -102,7 +90,7 @@ local filename = 'mimir-reads.json'; ) .addPanel( $.panel('Series queries / sec') + - $.statPanel($.queries.query_frontend.seriesQueriesPerSecond, format='reqps') + + $.statPanel($.queries.query_frontend.ncSeriesQueriesPerSecond, format='reqps') + $.panelDescription( 'Series queries per second', ||| diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index a91499ef556..9329a39d951 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "bf12954197422f36f0803ee217e378ad055f3837", - "sum": "EEPwMLfUIJT9iEUI/gCW9x6PxWoTBPSJOfabTF4rp1M=" + "version": "5cf8359e9f9d1dfc5f705fb36800031f99da7b8d", + "sum": "udZaafkbKYMGodLqsFhEe+Oy/St2p0edrK7hiMPEey0=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "bf12954197422f36f0803ee217e378ad055f3837", - "sum": "Qg992ZB0jkrS+YLq0Q7RV1fSHa8+hQT0jbpTyCGE2NI=" + "version": "5cf8359e9f9d1dfc5f705fb36800031f99da7b8d", + "sum": "mzLmCv9n3ldLChVGPfyRJOVKoBw+dfK40vU9792aHIM=" } ], "legacyImports": false From c750ae0ceca489dbc1c6e4737cea1918e7d532d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 17 Jul 2024 15:05:08 +0200 Subject: [PATCH 02/13] Update the Gateway row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/reads.libsonnet | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index 7e4339dd796..6c96430699b 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -105,16 +105,20 @@ local filename = 'mimir-reads.json'; $.row('Gateway') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel($.queries.gateway.readRequestsPerSecond) + $.qpsPanelNativeHistogram($.queries.gateway.requestsPerSecondMetric, $.queries.gateway.readRequestsPerSecondSelector) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) + $.latencyRecordingRulePanelNativeHistogram($.queries.gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway), $.queries.read_http_routes_regex], '' + [ + utils.showClassicHistogramQuery(utils.ncHistogramQuantile('0.99', $.queries.gateway.requestsPerSecondMetric, '%s, route=~"%s"' % [$.jobMatcher($._config.job_names.gateway), $.queries.read_http_routes_regex], [$._config.per_instance_label])), + utils.showNativeHistogramQuery(utils.ncHistogramQuantile('0.99', $.queries.gateway.requestsPerSecondMetric, '%s, route=~"%s"' % [$.jobMatcher($._config.job_names.gateway), $.queries.read_http_routes_regex], [$._config.per_instance_label])), + ], + ['', ''] ) ) ) From 4130fa350b15a8ae2057dcbf8be772d18c741340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 17 Jul 2024 15:40:57 +0200 Subject: [PATCH 03/13] Factor out common things about the per instance latency panel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../mimir-mixin/dashboards/dashboard-utils.libsonnet | 9 +++++++++ operations/mimir-mixin/dashboards/reads.libsonnet | 9 +++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 34f481ccd35..2dc14997f8d 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -258,6 +258,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], }, + perInstanceLatencyPanelNativeHistogram(quantile, metric, selector, instanceLabel=$._config.per_instance_label):: + $.hiddenLegendQueryPanel( + [ + utils.showClassicHistogramQuery(utils.ncHistogramQuantile(quantile, metric, utils.toPrometheusSelectorNaked(selector), [instanceLabel])), + utils.showNativeHistogramQuery(utils.ncHistogramQuantile(quantile, metric, utils.toPrometheusSelectorNaked(selector), [instanceLabel])), + ], + ['', ''] + ), + // Creates a panel like queryPanel() but if the legend contains only 1 entry, // than it configures the series alias color to the one used to display failures. failurePanel(queries, legends, legendLink=null):: diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index 6c96430699b..8e97526a7ad 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -113,12 +113,9 @@ local filename = 'mimir-reads.json'; ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - [ - utils.showClassicHistogramQuery(utils.ncHistogramQuantile('0.99', $.queries.gateway.requestsPerSecondMetric, '%s, route=~"%s"' % [$.jobMatcher($._config.job_names.gateway), $.queries.read_http_routes_regex], [$._config.per_instance_label])), - utils.showNativeHistogramQuery(utils.ncHistogramQuantile('0.99', $.queries.gateway.requestsPerSecondMetric, '%s, route=~"%s"' % [$.jobMatcher($._config.job_names.gateway), $.queries.read_http_routes_regex], [$._config.per_instance_label])), - ], - ['', ''] + $.perInstanceLatencyPanelNativeHistogram( + $.queries.gateway.requestsPerSecondMetric, + $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)], ) ) ) From cedb2bb342ffae21b6be2034593f9d17ab737db4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 17 Jul 2024 17:15:43 +0200 Subject: [PATCH 04/13] Update query frontend raws in Read and Remote ruler reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../metamonitoring/grafana-dashboards.yaml | 123 +++++++++++++++--- .../dashboards/mimir-reads.json | 47 +++++-- .../dashboards/mimir-remote-ruler-reads.json | 76 +++++++++-- .../dashboards/mimir-reads.json | 47 +++++-- .../dashboards/mimir-remote-ruler-reads.json | 76 +++++++++-- .../dashboards/dashboard-utils.libsonnet | 8 +- .../dashboards/remote-ruler-reads.libsonnet | 1 + 7 files changed, 325 insertions(+), 53 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index 7fe6344628a..bf1effc9c00 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -18283,7 +18283,13 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -18332,22 +18338,40 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -18394,7 +18418,14 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -24827,7 +24858,13 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -24876,22 +24913,40 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -24938,7 +24993,14 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -26151,6 +26213,35 @@ data: "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 8765cdbd175..6f5e28cc581 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -654,7 +654,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -703,22 +709,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -765,7 +789,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json index c2fde8911aa..02361bf1c11 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json @@ -320,7 +320,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -369,22 +375,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -431,7 +455,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -1644,6 +1675,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 84f58445333..3627af8c39f 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -654,7 +654,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -703,22 +709,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -765,7 +789,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json index ad4ce765103..ca01d0652a1 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json @@ -320,7 +320,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -369,22 +375,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -431,7 +455,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\", route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -1644,6 +1675,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index f0aebd1c1ee..5b721375137 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1531,17 +1531,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row($.capitalize(rowTitlePrefix + 'query-frontend')) .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher(queryFrontendJobName), queryRoutesRegex]) + $.qpsPanelNativeHistogram('cortex_request_duration_seconds', utils.toPrometheusSelectorNaked($.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)])) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) + $.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher(queryFrontendJobName), queryRoutesRegex], '' - ) + $.perInstanceLatencyPanelNativeHistogram('0.99', 'cortex_request_duration_seconds', $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) ), local description = |||

diff --git a/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet b/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet index 9810193e7d3..e706808800c 100644 --- a/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet +++ b/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet @@ -10,6 +10,7 @@ local filename = 'mimir-remote-ruler-reads.json'; assert std.md5(filename) == 'f103238f7f5ab2f1345ce650cbfbfe2f' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Remote ruler reads') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() + .addShowNativeLatencyVariable() .addRowIf( $._config.show_dashboard_descriptions.reads, ($.row('Remote ruler reads dashboard description') { height: '175px', showTitle: false }) From a4645fe683a1181e8822981eedb8efd5b6a19356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 08:37:29 +0200 Subject: [PATCH 05/13] Update ingester and store-gateway rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../metamonitoring/grafana-dashboards.yaml | 94 +++++++++++++++---- .../dashboards/mimir-reads.json | 94 +++++++++++++++---- .../dashboards/mimir-reads.json | 94 +++++++++++++++---- .../dashboards/dashboard-queries.libsonnet | 6 ++ .../mimir-mixin/dashboards/reads.libsonnet | 24 +++-- 5 files changed, 254 insertions(+), 58 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index bf1effc9c00..6edf72ab078 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -19542,7 +19542,13 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -19591,22 +19597,40 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -19653,7 +19677,14 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -19849,7 +19880,13 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -19898,22 +19935,40 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -19960,7 +20015,14 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 6f5e28cc581..fef9f52715d 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -1913,7 +1913,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1962,22 +1968,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -2024,7 +2048,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -2220,7 +2251,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -2269,22 +2306,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -2331,7 +2386,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 3627af8c39f..23d414b96b0 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -1913,7 +1913,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1962,22 +1968,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -2024,7 +2048,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null @@ -2220,7 +2251,13 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\",route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -2269,22 +2306,40 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Latency", @@ -2331,7 +2386,14 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]))) < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "", + "legendLink": null + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((store-gateway.*|cortex|mimir|mimir-backend.*))\", route=~\"/gatewaypb.StoreGateway/.*\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index bbcac377231..de82a79cf67 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -239,6 +239,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ingester: { + requestsPerSecondMetric: 'cortex_request_duration_seconds', + ingestOrClassicDeduplicatedQuery(perIngesterQuery, groupByLabels=''):: ||| ( # Classic storage sum by (%(groupByCluster)s, %(groupByLabels)s) (%(perIngesterQuery)s) @@ -264,5 +266,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; distributor: variables.distributorMatcher, }, }, + + store_gateway: { + requestsPerSecondMetric: 'cortex_request_duration_seconds', + }, }, } diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index 8e97526a7ad..d079c12c415 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -130,34 +130,38 @@ local filename = 'mimir-reads.json'; $.row('Ingester') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"%s"}' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex]) + $.qpsPanelNativeHistogram($.queries.ingester.requestsPerSecondMetric, '%s,route=~"%s"' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex]) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', $._config.ingester_read_path_routes_regex)]) + $.latencyRecordingRulePanelNativeHistogram($.queries.ingester.requestsPerSecondMetric, $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', $._config.ingester_read_path_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], '' - ) + $.perInstanceLatencyPanelNativeHistogram( + '0.99', + $.queries.ingester.requestsPerSecondMetric, + $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', $._config.ingester_read_path_routes_regex)], + ), ) ) .addRow( $.row('Store-gateway') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.qpsPanelNativeHistogram($.queries.store_gateway.requestsPerSecondMetric, '%s,route=~"/gatewaypb.StoreGateway/.*"' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + $.latencyRecordingRulePanelNativeHistogram($.queries.store_gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' - ) + $.perInstanceLatencyPanelNativeHistogram( + '0.99', + $.queries.store_gateway.requestsPerSecondMetric, + $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')], + ), ) ) .addRowIf( From 09196554ba0a9d787243a3e13aae98ea40315c66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 08:42:45 +0200 Subject: [PATCH 06/13] Update changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 1 + operations/helm/charts/mimir-distributed/CHANGELOG.md | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8cd87607de..9410b885128 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,7 @@ * [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7674 #8502 * Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. + * Reads dashboard: `cortex_request_duration_seconds` metric. #8752 * [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538 * [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543 * [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484 diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index aa6124728f6..e0cf4439bf8 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -31,6 +31,7 @@ Entries should include a reference to the Pull Request that introduced the chang * [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7674 * Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. + * Reads dashboard: `cortex_request_duration_seconds` metric. #8752 * [ENHANCEMENT] Memcached: Update to Memcached 1.6.28 and memcached-exporter 0.14.4. #8557 * [ENHANCEMENT] Add missing fields in multiple topology spread constraints. #8533 * [ENHANCEMENT] Add support for setting the image pull secrets, node selectors, tolerations and topology spread constraints for the Grafana Agent pods used for metamonitoring. #8670 From bb28b337eaa5fab7b0f6d6dc2f9be7aa16494df2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 08:56:18 +0200 Subject: [PATCH 07/13] Factor out ingester reads selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/dashboard-queries.libsonnet | 4 ++++ operations/mimir-mixin/dashboards/reads.libsonnet | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index de82a79cf67..3257886d5e0 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -33,6 +33,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; local variables = { gatewayMatcher: $.jobMatcher($._config.job_names.gateway), distributorMatcher: $.jobMatcher($._config.job_names.distributor), + ingesterMatcher: $.jobMatcher($._config.job_names.ingester), queryFrontendMatcher: $.jobMatcher($._config.job_names.query_frontend), rulerMatcher: $.jobMatcher($._config.job_names.ruler), alertmanagerMatcher: $.jobMatcher($._config.job_names.alertmanager), @@ -40,6 +41,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; writeHTTPRoutesRegex: $.queries.write_http_routes_regex, writeGRPCRoutesRegex: $.queries.write_grpc_routes_regex, readHTTPRoutesRegex: $.queries.read_http_routes_regex, + readGRPCIngesterRoute: $.queries.read_grpc_ingester_route, perClusterLabel: $._config.per_cluster_label, recordingRulePrefix: $.recordingRulePrefix($.jobSelector('any')), // The job name does not matter here. groupPrefixJobs: $._config.group_prefix_jobs, @@ -48,6 +50,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; write_http_routes_regex: 'api_(v1|prom)_push|otlp_v1_metrics', write_grpc_routes_regex: '/distributor.Distributor/Push|/httpgrpc.*', + read_grpc_ingester_route: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', read_http_routes_regex: '(prometheus|api_prom)_api_v1_.+', query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', @@ -240,6 +243,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ingester: { requestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(ingesterMatcher)s,route=~"%(readGRPCIngesterRoute)s"' % variables, ingestOrClassicDeduplicatedQuery(perIngesterQuery, groupByLabels=''):: ||| ( # Classic storage diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index d079c12c415..0d57ba83957 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -130,7 +130,7 @@ local filename = 'mimir-reads.json'; $.row('Ingester') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanelNativeHistogram($.queries.ingester.requestsPerSecondMetric, '%s,route=~"%s"' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex]) + $.qpsPanelNativeHistogram($.queries.ingester.requestsPerSecondMetric, $.queries.ingester.readRequestsPerSecondSelector) ) .addPanel( $.timeseriesPanel('Latency') + From 3f68c8a5485153b43a7cd6282bd1e637dc4faf6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 09:00:37 +0200 Subject: [PATCH 08/13] Factor out store-gateway reads selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/dashboard-queries.libsonnet | 4 ++++ operations/mimir-mixin/dashboards/reads.libsonnet | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 3257886d5e0..d5c9fa9445a 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -38,10 +38,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; rulerMatcher: $.jobMatcher($._config.job_names.ruler), alertmanagerMatcher: $.jobMatcher($._config.job_names.alertmanager), namespaceMatcher: $.namespaceMatcher(), + storeGatewayMatcher: $.jobMatcher($._config.job_names.store_gateway), writeHTTPRoutesRegex: $.queries.write_http_routes_regex, writeGRPCRoutesRegex: $.queries.write_grpc_routes_regex, readHTTPRoutesRegex: $.queries.read_http_routes_regex, readGRPCIngesterRoute: $.queries.read_grpc_ingester_route, + readGRPCStoreGatewayRoute: $.queries.read_grpc_store_gateway_route, perClusterLabel: $._config.per_cluster_label, recordingRulePrefix: $.recordingRulePrefix($.jobSelector('any')), // The job name does not matter here. groupPrefixJobs: $._config.group_prefix_jobs, @@ -51,6 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; write_http_routes_regex: 'api_(v1|prom)_push|otlp_v1_metrics', write_grpc_routes_regex: '/distributor.Distributor/Push|/httpgrpc.*', read_grpc_ingester_route: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', + read_grpc_store_gateway_route: '/gatewaypb.StoreGateway/.*', read_http_routes_regex: '(prometheus|api_prom)_api_v1_.+', query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', @@ -273,6 +276,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; store_gateway: { requestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(storeGatewayMatcher)s,route=~"%(readGRPCStoreGatewayRoute)s"' % variables, }, }, } diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index 0d57ba83957..c0f669e23cc 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -149,7 +149,7 @@ local filename = 'mimir-reads.json'; $.row('Store-gateway') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanelNativeHistogram($.queries.store_gateway.requestsPerSecondMetric, '%s,route=~"/gatewaypb.StoreGateway/.*"' % $.jobMatcher($._config.job_names.store_gateway)) + $.qpsPanelNativeHistogram($.queries.store_gateway.requestsPerSecondMetric, $.queries.store_gateway.readRequestsPerSecondSelector) ) .addPanel( $.timeseriesPanel('Latency') + From f1e7f35e98d890f0dba596772fd1fdfac518356d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 11:55:35 +0200 Subject: [PATCH 09/13] Fix missing quantile spec in gateway panel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/dashboard-queries.libsonnet | 3 --- operations/mimir-mixin/dashboards/reads.libsonnet | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index db8a3d1174e..199eabb5ccd 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -87,9 +87,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, query_frontend: { - // deprecated, will be removed - readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, - local p = self, requestsPerSecondMetric: 'cortex_request_duration_seconds', readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index c0f669e23cc..5a4883f445e 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -114,6 +114,7 @@ local filename = 'mimir-reads.json'; .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.perInstanceLatencyPanelNativeHistogram( + '0.99', $.queries.gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)], ) From 321cd30525b8de044a88e8b149f40654d009b1c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 12:00:49 +0200 Subject: [PATCH 10/13] Rerun check-mixin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/dashboard-queries.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 199eabb5ccd..d59b1979a5c 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -43,7 +43,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; writeGRPCDistributorRoutesRegex: $.queries.write_grpc_distributor_routes_regex, writeGRPCIngesterRoute: $.queries.write_grpc_ingester_route, readHTTPRoutesRegex: $.queries.read_http_routes_regex, - readGRPCIngesterRoute: $.queries.read_grpc_ingester_route, + readGRPCIngesterRoute: $.queries.read_grpc_ingester_route, readGRPCStoreGatewayRoute: $.queries.read_grpc_store_gateway_route, perClusterLabel: $._config.per_cluster_label, recordingRulePrefix: $.recordingRulePrefix($.jobSelector('any')), // The job name does not matter here. From 4575a7522218c55f5b3bb62e5c41b8d1afb8b99a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 13:50:40 +0200 Subject: [PATCH 11/13] Replace literal metric name with const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/dashboard-utils.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 8f3ef7091df..e6d6f404b65 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1533,15 +1533,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row($.capitalize(rowTitlePrefix + 'query-frontend')) .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanelNativeHistogram('cortex_request_duration_seconds', utils.toPrometheusSelectorNaked($.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)])) + $.qpsPanelNativeHistogram($.queries.query_frontend.requestsPerSecondMetric, utils.toPrometheusSelectorNaked($.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)])) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) + $.latencyRecordingRulePanelNativeHistogram($.queries.query_frontend.requestsPerSecondMetric, $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.perInstanceLatencyPanelNativeHistogram('0.99', 'cortex_request_duration_seconds', $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) + $.perInstanceLatencyPanelNativeHistogram('0.99', $.queries.query_frontend.requestsPerSecondMetric, $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) ), local description = |||

From bb8a8c0bcb453b5f728430a867218fe5b0d54917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 13:50:58 +0200 Subject: [PATCH 12/13] Do not redefine ingester read path routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/dashboard-queries.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index d59b1979a5c..df4e46f1c3a 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -56,7 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; write_grpc_distributor_routes_regex: '/distributor.Distributor/Push|/httpgrpc.*', write_grpc_ingester_route: '/cortex.Ingester/Push', read_http_routes_regex: '(prometheus|api_prom)_api_v1_.+', - read_grpc_ingester_route: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', + read_grpc_ingester_route: $._config.ingester_read_path_routes_regex, read_grpc_store_gateway_route: '/gatewaypb.StoreGateway/.*', query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', From a93ba4e75773538ac5d1da43e68d75dec17f4b4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jul 2024 13:59:43 +0200 Subject: [PATCH 13/13] Define store gateway read path route regex like for ingester MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/config.libsonnet | 3 +++ operations/mimir-mixin/dashboards/dashboard-queries.libsonnet | 2 +- operations/mimir-mixin/dashboards/reads.libsonnet | 4 ++-- pkg/storegateway/storegatewaypb/gateway.proto | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index 31e13699ee5..e06fc25bb2c 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -657,6 +657,9 @@ // All query methods from IngesterServer interface. Basically everything except Push. ingester_read_path_routes_regex: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', + // All query methods from StoregatewayServer interface. + store_gateway_read_path_routes_regex: '/gatewaypb.StoreGateway/.*', + // The default datasource used for dashboards. dashboard_datasource: 'default', datasource_regex: '', diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index df4e46f1c3a..3e1ebafc239 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -57,7 +57,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; write_grpc_ingester_route: '/cortex.Ingester/Push', read_http_routes_regex: '(prometheus|api_prom)_api_v1_.+', read_grpc_ingester_route: $._config.ingester_read_path_routes_regex, - read_grpc_store_gateway_route: '/gatewaypb.StoreGateway/.*', + read_grpc_store_gateway_route: $._config.store_gateway_read_path_routes_regex, query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index 5a4883f445e..b079f2f2ee0 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -154,14 +154,14 @@ local filename = 'mimir-reads.json'; ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanelNativeHistogram($.queries.store_gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + $.latencyRecordingRulePanelNativeHistogram($.queries.store_gateway.requestsPerSecondMetric, $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', $._config.store_gateway_read_path_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.perInstanceLatencyPanelNativeHistogram( '0.99', $.queries.store_gateway.requestsPerSecondMetric, - $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')], + $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', $._config.store_gateway_read_path_routes_regex)], ), ) ) diff --git a/pkg/storegateway/storegatewaypb/gateway.proto b/pkg/storegateway/storegatewaypb/gateway.proto index ee5d0ca58ea..9d2c3ff5803 100644 --- a/pkg/storegateway/storegatewaypb/gateway.proto +++ b/pkg/storegateway/storegatewaypb/gateway.proto @@ -25,4 +25,6 @@ service StoreGateway { // LabelValues returns all label values for given label name. rpc LabelValues(thanos.LabelValuesRequest) returns (thanos.LabelValuesResponse); + + // When adding more read-path methods here, please update store_gateway_read_path_routes_regex in operations/mimir-mixin/config.libsonnet as well as needed. }