From 91b235ebd561c3d32b63bcff8c14445b8527f49a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 3 Feb 2022 10:14:44 +0100 Subject: [PATCH] Added querier autoscaling panels and alerts to mixin (#1006) * Added querier autoscaling panels and alerts to mixin Signed-off-by: Marco Pracucci * Fixed typos Signed-off-by: Marco Pracucci * Addressed review comments Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + Makefile | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 18 +- .../dashboards/mimir-reads.json | 361 ++++++++++++++++-- operations/mimir-mixin/alerts.libsonnet | 1 + .../mimir-mixin/alerts/alerts.libsonnet | 2 +- .../mimir-mixin/alerts/autoscaling.libsonnet | 27 ++ .../mimir-mixin/alerts/blocks.libsonnet | 2 +- .../mimir-mixin/alerts/compactor.libsonnet | 2 +- operations/mimir-mixin/config.libsonnet | 6 + .../dashboards/dashboard-utils.libsonnet | 33 ++ .../mimir-mixin/dashboards/reads.libsonnet | 62 +++ operations/mimir-mixin/docs/playbooks.md | 27 ++ .../mimir-mixin/mixin-compiled.libsonnet | 9 + 14 files changed, 510 insertions(+), 43 deletions(-) create mode 100644 operations/mimir-mixin/alerts/autoscaling.libsonnet create mode 100644 operations/mimir-mixin/mixin-compiled.libsonnet diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d513085d8d..02d1e84ee88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -468,6 +468,7 @@ * [CHANGE] Enabled resources dashboards by default. Can be disabled setting `resources_dashboards_enabled` config field to `false`. #920 * [FEATURE] Added `Cortex / Overrides` dashboard, displaying default limits and per-tenant overrides applied to Mimir. #673 * [FEATURE] Added `Mimir / Tenants` and `Mimir / Top tenants` dashboards, displaying user-based metrics. #776 +* [FEATURE] Added querier autoscaling panels and alerts. #1006 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. 
[#317](https://github.com/grafana/cortex-jsonnet/pull/317) * [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. [#328](https://github.com/grafana/cortex-jsonnet/pull/328) * [ENHANCEMENT] Ruler dashboard: added object storage metrics. [#354](https://github.com/grafana/cortex-jsonnet/pull/354) diff --git a/Makefile b/Makefile index 0a0cc185136..d34d169f1b4 100644 --- a/Makefile +++ b/Makefile @@ -437,7 +437,7 @@ check-mixin-playbook: build-mixin build-mixin: check-mixin-jb @rm -rf $(MIXIN_OUT_PATH) && mkdir $(MIXIN_OUT_PATH) - @mixtool generate all --output-alerts $(MIXIN_OUT_PATH)/alerts.yaml --output-rules $(MIXIN_OUT_PATH)/rules.yaml --directory $(MIXIN_OUT_PATH)/dashboards ${MIXIN_PATH}/mixin.libsonnet + @mixtool generate all --output-alerts $(MIXIN_OUT_PATH)/alerts.yaml --output-rules $(MIXIN_OUT_PATH)/rules.yaml --directory $(MIXIN_OUT_PATH)/dashboards ${MIXIN_PATH}/mixin-compiled.libsonnet @cd $(MIXIN_OUT_PATH)/.. && zip -q -r mimir-mixin.zip $$(basename "$(MIXIN_OUT_PATH)") @echo "The mixin has been compiled to $(MIXIN_OUT_PATH) and archived to $$(realpath --relative-to=$$(pwd) $(MIXIN_OUT_PATH)/../mimir-mixin.zip)" diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 3322fdb94e2..197d43e6745 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -122,7 +122,7 @@ groups: for: 5m labels: severity: critical -- name: cortex_instance_limits_alerts +- name: mimir_instance_limits_alerts rules: - alert: MimirIngesterReachingSeriesLimit annotations: @@ -436,7 +436,7 @@ groups: increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 labels: severity: critical -- name: cortex_blocks_alerts +- name: mimir_blocks_alerts rules: - alert: MimirIngesterHasNotShippedBlocks annotations: @@ -580,7 +580,7 @@ groups: for: 6h labels: severity: warning -- name: cortex_compactor_alerts 
+- name: mimir_compactor_alerts rules: - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks annotations: @@ -641,3 +641,15 @@ groups: for: 1m labels: severity: warning +- name: mimir_autoscaling_querier + rules: + - alert: MimirQuerierAutoscalerNotActive + annotations: + message: The Horizontal Pod Autoscaler (HPA) in {{ $labels.namespace }} is not active. + expr: | + kube_horizontalpodautoscaler_status_condition{horizontalpodautoscaler="keda-hpa-querier",condition="ScalingActive",status="false"} + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + > 0 + for: 15m + labels: + severity: warning diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 67478633ca3..d03a828b77c 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -1464,6 +1464,295 @@ "title": "Querier", "titleSize": "h6" }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "### Replicas\nThe minimum, maximum, and current number of querier replicas.\n\n", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kube_horizontalpodautoscaler_spec_min_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=\"keda-hpa-querier\"}", + "format": "time_series", + "interval": "15s", + "intervalFactor": 2, + "legendFormat": "Min", + 
"legendLink": null, + "step": 10 + }, + { + "expr": "kube_horizontalpodautoscaler_spec_max_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=\"keda-hpa-querier\"}", + "format": "time_series", + "interval": "15s", + "intervalFactor": 2, + "legendFormat": "Max", + "legendLink": null, + "step": 10 + }, + { + "expr": "kube_horizontalpodautoscaler_status_current_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=\"keda-hpa-querier\"}", + "format": "time_series", + "interval": "15s", + "intervalFactor": 2, + "legendFormat": "Current", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "### Scaling metric\nThis panel shows the result of the query used as scaling metric and target/threshold used.\nThe desired number of replicas is computed by HPA as: scaling metric / target per replica.\n\n", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Target per replica" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + } + ] + } + ] + }, + "fill": 1, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, +
"renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "keda_metrics_adapter_scaler_metrics_value +\non(metric) group_left\nlabel_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=\"keda-hpa-querier\"}\n * 0, \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "format": "time_series", + "interval": "15s", + "intervalFactor": 2, + "legendFormat": "Scaling metric", + "legendLink": null, + "step": 10 + }, + { + "expr": "kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=\"keda-hpa-querier\"}", + "format": "time_series", + "interval": "15s", + "intervalFactor": 2, + "legendFormat": "Target per replica", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Scaling metric", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(keda_metrics_adapter_scaler_errors[$__rate_interval])) +\non(metric) group_left\nlabel_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=\"keda-hpa-querier\"}\n * 0, \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "format": "time_series", + "interval": "15s", + "intervalFactor": 2, + "legendFormat": "Failures per second", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Autoscaler failures rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Querier - autoscaling", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -1483,7 +1772,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 19, + "id": 22, "legend": { "avg": false, "current": false, @@ -1560,7 +1849,7 @@
"dashes": false, "datasource": "$datasource", "fill": 1, - "id": 20, + "id": 23, "legend": { "avg": false, "current": false, @@ -1652,7 +1941,7 @@ "dashes": false, "datasource": "$datasource", "fill": 0, - "id": 21, + "id": 24, "legend": { "show": false }, @@ -1741,7 +2030,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 22, + "id": 25, "legend": { "avg": false, "current": false, @@ -1818,7 +2107,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 23, + "id": 26, "legend": { "avg": false, "current": false, @@ -1910,7 +2199,7 @@ "dashes": false, "datasource": "$datasource", "fill": 0, - "id": 24, + "id": 27, "legend": { "show": false }, @@ -1999,7 +2288,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 25, + "id": 28, "legend": { "avg": false, "current": false, @@ -2076,7 +2365,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 26, + "id": 29, "legend": { "avg": false, "current": false, @@ -2183,7 +2472,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 27, + "id": 30, "legend": { "avg": false, "current": false, @@ -2260,7 +2549,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 28, + "id": 31, "legend": { "avg": false, "current": false, @@ -2356,7 +2645,7 @@ "datasource": "$datasource", "description": "### Hit ratio\nEven if you do not set up memcached for the blocks index cache, you will still see data in this panel because the store-gateway by default has an\nin-memory blocks index cache.\n\n", "fill": 1, - "id": 29, + "id": 32, "legend": { "avg": false, "current": false, @@ -2445,7 +2734,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 30, + "id": 33, "legend": { "avg": false, "current": false, @@ -2522,7 +2811,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 31, + "id": 34, "legend": { "avg": false, "current": false, @@ -2617,7 +2906,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 
32, + "id": 35, "legend": { "avg": false, "current": false, @@ -2706,7 +2995,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 33, + "id": 36, "legend": { "avg": false, "current": false, @@ -2783,7 +3072,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 34, + "id": 37, "legend": { "avg": false, "current": false, @@ -2878,7 +3167,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 35, + "id": 38, "legend": { "avg": false, "current": false, @@ -2967,7 +3256,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 36, + "id": 39, "legend": { "avg": false, "current": false, @@ -3044,7 +3333,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 37, + "id": 40, "legend": { "avg": false, "current": false, @@ -3139,7 +3428,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 38, + "id": 41, "legend": { "avg": false, "current": false, @@ -3228,7 +3517,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 39, + "id": 42, "legend": { "avg": false, "current": false, @@ -3305,7 +3594,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 40, + "id": 43, "legend": { "avg": false, "current": false, @@ -3382,7 +3671,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 41, + "id": 44, "legend": { "avg": false, "current": false, @@ -3477,7 +3766,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 42, + "id": 45, "legend": { "avg": false, "current": false, @@ -3584,7 +3873,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 43, + "id": 46, "legend": { "avg": false, "current": false, @@ -3679,7 +3968,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 44, + "id": 47, "legend": { "avg": false, "current": false, @@ -3774,7 +4063,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 45, + "id": 48, "legend": { "avg": false, "current": false, @@ -3869,7 +4158,7 @@ 
"dashes": false, "datasource": "$datasource", "fill": 1, - "id": 46, + "id": 49, "legend": { "avg": false, "current": false, @@ -3976,7 +4265,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 47, + "id": 50, "legend": { "avg": false, "current": false, @@ -4053,7 +4342,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 48, + "id": 51, "legend": { "avg": false, "current": false, @@ -4130,7 +4419,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 49, + "id": 52, "legend": { "avg": false, "current": false, @@ -4225,7 +4514,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 50, + "id": 53, "legend": { "avg": false, "current": false, @@ -4332,7 +4621,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 51, + "id": 54, "legend": { "avg": false, "current": false, @@ -4427,7 +4716,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 52, + "id": 55, "legend": { "avg": false, "current": false, @@ -4522,7 +4811,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 53, + "id": 56, "legend": { "avg": false, "current": false, @@ -4617,7 +4906,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 54, + "id": 57, "legend": { "avg": false, "current": false, diff --git a/operations/mimir-mixin/alerts.libsonnet b/operations/mimir-mixin/alerts.libsonnet index dbab11a805f..6a91cf83725 100644 --- a/operations/mimir-mixin/alerts.libsonnet +++ b/operations/mimir-mixin/alerts.libsonnet @@ -4,5 +4,6 @@ (import 'alerts/alertmanager.libsonnet') + (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') + + (import 'alerts/autoscaling.libsonnet') + { _config:: $._config + $._group_config }, } diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index fcb9d67f04e..68883da26b5 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ 
b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -224,7 +224,7 @@ ], }, { - name: 'cortex_instance_limits_alerts', + name: 'mimir_instance_limits_alerts', rules: [ { alert: $.alertName('IngesterReachingSeriesLimit'), diff --git a/operations/mimir-mixin/alerts/autoscaling.libsonnet b/operations/mimir-mixin/alerts/autoscaling.libsonnet new file mode 100644 index 00000000000..836f58accb5 --- /dev/null +++ b/operations/mimir-mixin/alerts/autoscaling.libsonnet @@ -0,0 +1,27 @@ +(import 'alerts-utils.libsonnet') { + groups+: if !$._config.autoscaling.querier_enabled then [] else [ + { + name: 'mimir_autoscaling_querier', + rules: [ + { + alert: $.alertName('QuerierAutoscalerNotActive'), + 'for': '15m', + expr: ||| + kube_horizontalpodautoscaler_status_condition{horizontalpodautoscaler="%(hpa_name)s",condition="ScalingActive",status="false"} + * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) + > 0 + ||| % { + aggregation_labels: $._config.alert_aggregation_labels, + hpa_name: $._config.autoscaling.querier_hpa_name, + }, + labels: { + severity: 'warning', + }, + annotations: { + message: 'The Horizontal Pod Autoscaler (HPA) in {{ $labels.namespace }} is not active.' % $._config, + }, + }, + ], + }, + ], +} diff --git a/operations/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet index 1bfe2b7e32d..98d4f2c4c69 100644 --- a/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/operations/mimir-mixin/alerts/blocks.libsonnet @@ -1,7 +1,7 @@ (import 'alerts-utils.libsonnet') { groups+: [ { - name: 'cortex_blocks_alerts', + name: 'mimir_blocks_alerts', rules: [ { // Alert if the ingester has not shipped any block in the last 4h. 
It also checks cortex_ingester_ingested_samples_total diff --git a/operations/mimir-mixin/alerts/compactor.libsonnet b/operations/mimir-mixin/alerts/compactor.libsonnet index 6ca2f0a1985..7eda7984e1b 100644 --- a/operations/mimir-mixin/alerts/compactor.libsonnet +++ b/operations/mimir-mixin/alerts/compactor.libsonnet @@ -1,7 +1,7 @@ (import 'alerts-utils.libsonnet') { groups+: [ { - name: 'cortex_compactor_alerts', + name: 'mimir_compactor_alerts', rules: [ { // Alert if the compactor has not successfully cleaned up blocks in the last 6h. diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index ca218465be2..7a578066e41 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -67,6 +67,12 @@ top_tenants: true, }, + // Whether autoscaling panels and alerts should be enabled for specific Mimir services. + autoscaling: { + querier_enabled: false, + querier_hpa_name: 'keda-hpa-querier', + }, + // The routes to exclude from alerts. 
alert_excluded_routes: [], }, diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 8f60b8e2ab6..f2fbf9f774d 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -561,6 +561,39 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerName, ], + filterKedaMetricByHPA(query, hpa_name):: + ||| + %(query)s + + on(metric) group_left + label_replace( + kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler="%(hpa_name)s"} + * 0, "metric", "$1", "metric_name", "(.+)" + ) + ||| % { + query: query, + hpa_name: hpa_name, + namespace: $.namespaceMatcher(), + }, + + panelAxisPlacement(seriesName, placement):: { + fieldConfig+: { + overrides+: [ + { + matcher: { + id: 'byName', + options: seriesName, + }, + properties: [ + { + id: 'custom.axisPlacement', + value: placement, + }, + ], + }, + ], + }, + }, + panelDescription(title, description):: { description: ||| ### %s diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index d24a6a1ad51..e5ca2b78f5a 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -198,6 +198,68 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('s') } ) ) + .addRowIf( + $._config.autoscaling.querier_enabled, + $.row('Querier - autoscaling') + .addPanel( + local title = 'Replicas'; + $.panel(title) + + $.queryPanel( + [ + 'kube_horizontalpodautoscaler_spec_min_replicas{%s, horizontalpodautoscaler="%s"}' % [$.namespaceMatcher(), $._config.autoscaling.querier_hpa_name], + 'kube_horizontalpodautoscaler_spec_max_replicas{%s, horizontalpodautoscaler="%s"}' % [$.namespaceMatcher(), $._config.autoscaling.querier_hpa_name], + 'kube_horizontalpodautoscaler_status_current_replicas{%s, 
horizontalpodautoscaler="%s"}' % [$.namespaceMatcher(), $._config.autoscaling.querier_hpa_name], + ], + [ + 'Min', + 'Max', + 'Current', + ], + ) + + $.panelDescription( + title, + ||| + The minimum, maximum, and current number of querier replicas. + ||| + ), + ) + .addPanel( + local title = 'Scaling metric'; + $.panel(title) + + $.queryPanel( + [ + $.filterKedaMetricByHPA('keda_metrics_adapter_scaler_metrics_value', $._config.autoscaling.querier_hpa_name), + 'kube_horizontalpodautoscaler_spec_target_metric{%s, horizontalpodautoscaler="%s"}' % [$.namespaceMatcher(), $._config.autoscaling.querier_hpa_name], + ], [ + 'Scaling metric', + 'Target per replica', + ] + ) + + $.panelDescription( + title, + ||| + This panel shows the result of the query used as scaling metric and target/threshold used. + The desired number of replicas is computed by HPA as: scaling metric / target per replica. + ||| + ) + + $.panelAxisPlacement('Target per replica', 'right'), + ) + .addPanel( + local title = 'Autoscaler failures rate'; + $.panel(title) + + $.queryPanel( + $.filterKedaMetricByHPA('sum(rate(keda_metrics_adapter_scaler_errors[$__rate_interval]))', $._config.autoscaling.querier_hpa_name), + 'Failures per second' + ) + + $.panelDescription( + title, + ||| + The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom + metrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.
+ ||| + ), + ) + ) .addRow( $.row('Ingester') .addPanel( diff --git a/operations/mimir-mixin/docs/playbooks.md b/operations/mimir-mixin/docs/playbooks.md index f31048a0c70..8262edfd52c 100644 --- a/operations/mimir-mixin/docs/playbooks.md +++ b/operations/mimir-mixin/docs/playbooks.md @@ -832,6 +832,33 @@ How to **investigate**: ``` - In case you need to quickly reject write path traffic from a single tenant, you can override its `ingestion_rate` and `ingestion_rate_burst` setting lower values (so that some/most of their traffic will be rejected) +### MimirQuerierAutoscalerNotActive + +This alert fires when the Mimir querier Kubernetes Horizontal Pod Autoscaler's (HPA) `ScalingActive` condition is `false`. When this happens, the HPA is not able to calculate the desired scale, which generally indicates problems with fetching metrics. + +How it **works**: + +- HPA is configured to autoscale Mimir queriers based on custom metrics fetched from Prometheus via the KEDA custom metrics API server +- HPA periodically queries updated metrics and updates the number of desired replicas based on that +- Please refer to the [HPA documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) for more information about it + +How to **investigate**: + +- Check HPA conditions and events to get more details about the failure + ``` + kubectl describe hpa -n <namespace> keda-hpa-querier + ``` +- Ensure KEDA custom metrics API server is up and running + ``` + # Assuming KEDA is running in a dedicated namespace "keda": + kubectl get pods -n keda + ``` +- Check KEDA custom metrics API server logs + ``` + # Assuming KEDA is running in a dedicated namespace "keda": + kubectl logs -n keda deployment/keda-operator-metrics-apiserver + ``` + ## Mimir routes by path **Write path**: diff --git a/operations/mimir-mixin/mixin-compiled.libsonnet b/operations/mimir-mixin/mixin-compiled.libsonnet new file mode 100644 index 00000000000..914dc6f3a73 --- /dev/null +++
b/operations/mimir-mixin/mixin-compiled.libsonnet @@ -0,0 +1,9 @@ +(import 'mixin.libsonnet') + { + // Config overrides used when building the compiled version of the mimir-mixin. + // This includes all features, since the compiled version can't be customized. + _config+:: { + autoscaling+: { + querier_enabled: true, + }, + }, +}