This repository has been archived by the owner on Oct 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
OPENG-2066: Verify and double-check prometheus alert rules (#104)
- Loading branch information
1 parent
228848b
commit 6b7d8b7
Showing
2 changed files
with
187 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,187 @@ | ||
# Prometheus alerting rules for Grafana Tempo (distributors, ingesters,
# compactors, query-frontend, backend storage).
# Conventions used throughout:
#   - severity: critical  -> page-worthy, service-impacting
#   - severity: warning   -> investigate during working hours
#   - $labels.job / $labels.instance identify the affected component.
groups:
  - name: tempo_alerts
    rules:
      # Blocklist growing >40% week-over-week usually means compaction
      # cannot keep up with ingest; scale compactors.
      - alert: TempoBlockListRisingQuickly
        expr: |
          avg by (job, instance)(tempodb_blocklist_length) / avg by (job, instance)(tempodb_blocklist_length offset 7d) > 1.4
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Tempo block list rising quickly (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing a 40% rise in tempo blocklist length over the last 7 days. Consider scaling compactors."

      # Fires only when errors are both sustained (>2 in 1h) and ongoing (>0 in 5m),
      # so a single transient failure does not page.
      - alert: TempoCompactionsFailing
        expr: sum by (job, instance)(increase(tempodb_compaction_errors_total{}[1h])) > 2 and sum by (job, instance)(increase(tempodb_compaction_errors_total{}[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo compactions failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing more than 2 compactions failures in the past hour."

      - alert: TempoCompactorUnhealthy
        expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="compactor"}) > 0
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Tempo unhealthy compactor(s) (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy compactor(s)."

      # Outstanding blocks are normalized by replica count (ignoring(tenant)
      # group_left against tempo_build_info) to get a per-compactor figure.
      - alert: TempoCompactorsTooManyOutstandingBlocks
        expr: sum by (tenant) (tempodb_compaction_outstanding_blocks) / ignoring(tenant) group_left count(tempo_build_info) > 100
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: "Tempo too many outstanding compaction blocks (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is having too many outstanding compaction blocks for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors."

      # Percentage of received spans being discarded (rate ratio over 5m).
      - alert: TempoDiscardedSpans
        expr: 100 * sum by (instance,job)(rate(tempo_discarded_spans_total[5m])) / sum by (instance,job)(rate(tempo_distributor_spans_received_total[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tempo spans insertion failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% discard of spans."

      - alert: TempoDistributorPushLatency
        expr: histogram_quantile(0.99, sum by(le, job, instance) (rate(tempo_distributor_push_duration_seconds_bucket[5m]))) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tempo distributor push latency (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} distributor push is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency."

      - alert: TempoDistributorUnhealthy
        expr: max by (job, instance)(tempo_ring_members{state="Unhealthy", name="distributor"}) > 0
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Tempo unhealthy distributor(s) (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy distributor(s)."

      # No traces created despite push requests arriving => ingest path broken.
      - alert: TempoFailedIngestingRequests
        expr: sum by (job,instance)(increase (tempo_ingester_traces_created_total[5m])) / sum by (instance,job)(rate(tempo_request_duration_seconds_count{route='/tempopb.Pusher/PushBytesV2'}[5m])) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo pushing traces to ingester failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing failure in distributors pushing traces to the ingesters."

      - alert: TempoFrontendClients
        expr: tempo_query_frontend_connected_clients == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo frontend connected clients (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} has no frontend connected clients."

      - alert: TempoFrontendQueueLatency
        expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_query_frontend_queue_duration_seconds_bucket[15m]))) > 2
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Tempo frontend queue latency (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} frontend queue is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency."

      - alert: TempoIngesterFlushLatency
        expr: histogram_quantile(0.99, sum by(le,instance,job) (rate(tempo_ingester_flush_duration_seconds_bucket[5m]))) > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo ingester flush latency (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} ingester flush is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency."

      # Retries exhausting => data at risk of being dropped; critical.
      - alert: TempoIngesterFlushesFailing
        expr: sum by (instance,job)(increase(tempo_ingester_flush_failed_retries_total[1h])) > 2 and sum by(instance,job)(increase(tempo_ingester_flush_failed_retries_total[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo ingester flush retries failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing more than 2 flush retries failures in the past hour."

      # Individual flush failures are retried, so this is only a warning;
      # see TempoIngesterFlushesFailing for the retry-exhaustion case.
      - alert: TempoIngesterFlushesUnhealthy
        expr: sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[1h])) > 2 and sum by (instance,job)(increase(tempo_ingester_failed_flushes_total[5m])) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tempo ingester flush failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing more than 2 ingester flush failures in the past hour."

      - alert: TempoIngestersUnhealthy
        expr: max by (instance,job)(tempo_ring_members{state="Unhealthy", name="ingester"}) > 0
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Tempo unhealthy ingester(s) (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is having {{ printf \"%f\" $value }} unhealthy ingester(s)."

      - alert: TempoKVRequestErrors
        expr: 100 * sum(rate(tempo_kv_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,instance,job) / sum(rate(tempo_kv_request_duration_seconds_count[5m])) by (route,instance,job) > 10
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Tempo kv store request errors (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} {{ $labels.route }} KV store requests is experiencing {{ printf \"%.2f\" $value }}% error rate."

      # for: 0m is deliberate — a vanished scrape target should alert immediately.
      - alert: TempoTargetMissing
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus target missing (instance {{ $labels.instance }})"
          description: "A Prometheus target has disappeared. An exporter might be crashed."

      # Guarded by blocklist_length > 0 so empty tenants do not fire.
      - alert: TempoNoTenantIndexBuilders
        expr: sum by (tenant,job,instance) (tempodb_blocklist_tenant_index_builder) == 0 and max by (tenant,job,instance)(tempodb_blocklist_length) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo tenant index builder failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is having no tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale."

      - alert: TempoRequestErrors
        expr: 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[5m])) by (route,job,instance) / sum(rate(tempo_request_duration_seconds_count[5m])) by (route,job,instance) > 10
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Tempo request errors (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% request error rate."

      - alert: TempoRequestLatency
        expr: histogram_quantile(0.99, sum by(le, route,job,instance)(rate(tempo_request_duration_seconds_bucket[5m]))) > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo request latency (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency."

      - alert: TempoRetentionsFailing
        expr: sum by (job,instance)(increase(tempodb_retention_errors_total[1h])) > 2 and sum by (job,instance)(increase(tempodb_retention_errors_total[5m])) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo retentions failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing more than 2 retention failures in the past hour."

      # Second condition excludes components with no configured limit (limit == 0).
      - alert: TempoTCPConnectionsLimit
        expr: 100 * tempo_tcp_connections / tempo_tcp_connections_limit >= 80 and tempo_tcp_connections_limit > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tempo reaching max number of tcp connections (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is reaching {{ printf \"%.2f\" $value }}% of max tcp {{ $labels.protocol }} connections."

      - alert: TempoTenantIndexTooOld
        expr: max by(tenant,instance,job) (tempodb_blocklist_tenant_index_age_seconds) > 600
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo tenant old index (instance {{ $labels.instance }})"
          # Report the actual observed age rather than hard-coding the threshold.
          description: "The {{ $labels.job }} is experiencing a tenant {{ $labels.tenant }} with an index age of {{ printf \"%.0f\" $value }} seconds, above the 600 second threshold."

      - alert: TempoUserConfigurableOverridesReloadFailing
        expr: sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (instance,job)(increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0
        # NOTE(review): a pending period was missing here while every sibling
        # alert has one; 5m matches the other sustained-failure alerts — confirm
        # against the intended runbook before relying on it.
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Tempo user config override reload failing (instance {{ $labels.instance }})"
          description: "The {{ $labels.job }} is experiencing more than 5 user-configurable override reload failures in the past hour."
This file was deleted.
Oops, something went wrong.