Skip to content

Commit

Permalink
Merge pull request #14619 from MinaProtocol/alert/fix-stuck-in-catchu…
Browse files Browse the repository at this point in the history
…p-and-bootstrap

fix StuckInCatchup/Bootstrap alerts
  • Loading branch information
ghost-not-in-the-shell authored Dec 18, 2023
2 parents 944d28e + 2513346 commit 605b8c8
Showing 1 changed file with 19 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -255,25 +255,6 @@ groups:
description: "{{ $value }} blocks have been validated on network {{ $labels.testnet }} in the last hour (according to some node)."
runbook: "https://www.notion.so/minaprotocol/FewBlocksPerHour-47a6356f093242d988b0d9527ce23478"

# StuckInBootstrap: fires when at least one node of a testnet has accumulated
# 100 minutes or more of uptime with syncStatus="BOOTSTRAP" in the last 2 hours.
#
# Why 6000000 and not 7200000: the metric counts milliseconds, so 7200000 is the
# entire [2h] window. increase() only reaches the full window length when the
# series was scraped without gaps or counter resets for the whole 2 hours, so
# demanding the full value makes the alert practically unable to fire.
# 6000000 ms (100 min) leaves headroom for scrape jitter and short restarts.
#
# The inner comparison keeps only the offending series; count by (testnet)
# then yields the number of stuck nodes per testnet as the alert's $value.
- alert: StuckInBootstrap
  expr: count by (testnet) (increase(Coda_Runtime_process_uptime_ms_total{syncStatus = "BOOTSTRAP"}[2h]) >= 6000000) > 0
  # ${alert_evaluation_duration} is a template placeholder substituted before
  # the rule file is loaded.
  for: ${alert_evaluation_duration}
  labels:
    testnet: "{{ $labels.testnet }}"
    severity: critical
  annotations:
    summary: "One or more {{ $labels.testnet }} nodes are stuck at bootstrap for more than 100 mins within the recent 2 hours"

# StuckInCatchup: fires when at least one node of a testnet has accumulated
# 100 minutes or more of uptime with syncStatus="CATCHUP" in the last 2 hours.
#
# Why 6000000 and not 7200000: the metric counts milliseconds, so 7200000 is the
# entire [2h] window. increase() only reaches the full window length when the
# series was scraped without gaps or counter resets for the whole 2 hours, so
# demanding the full value makes the alert practically unable to fire.
# 6000000 ms (100 min) leaves headroom for scrape jitter and short restarts.
#
# The inner comparison keeps only the offending series; count by (testnet)
# then yields the number of stuck nodes per testnet as the alert's $value.
- alert: StuckInCatchup
  expr: count by (testnet) (increase(Coda_Runtime_process_uptime_ms_total{syncStatus = "CATCHUP"}[2h]) >= 6000000) > 0
  # ${alert_evaluation_duration} is a template placeholder substituted before
  # the rule file is loaded.
  for: ${alert_evaluation_duration}
  labels:
    testnet: "{{ $labels.testnet }}"
    severity: critical
  annotations:
    summary: "One or more {{ $labels.testnet }} nodes are stuck at catchup for more than 100 mins within the recent 2 hours"


- name: Warnings
rules:
- alert: HighBlockGossipLatency
Expand Down Expand Up @@ -638,7 +619,25 @@ groups:
summary: "One or more {{ $labels.testnet }} nodes are stuck at an old block height (Observed block height did not increase in the last 30m)"
description: "{{ $value }} blocks have been validated on network {{ $labels.testnet }} in the last hour (according to some node)."
runbook: "https://www.notion.so/minaprotocol/FewBlocksPerHour-47a6356f093242d988b0d9527ce23478"


# StuckInBootstrap
# Per testnet, take the largest 2h increase of the uptime counter among series
# whose syncStatus is "BOOTSTRAP" and alert once it is at least 6000000
# (100 minutes, per the summary below). The alert's $value is that maximum.
# ${berkeley_testnet} and ${alert_evaluation_duration} are template
# placeholders filled in before the rule file is loaded; ${berkeley_testnet}
# presumably expands to a label matcher selecting the network - confirm
# against the template's caller.
- alert: StuckInBootstrap
  expr: max by (testnet) (increase(Coda_Runtime_process_uptime_ms_total{${berkeley_testnet},syncStatus = "BOOTSTRAP"}[2h])) >= 6000000
  for: ${alert_evaluation_duration}
  labels:
    severity: critical
    testnet: "{{ $labels.testnet }}"
  annotations:
    summary: "One or more {{ $labels.testnet }} nodes are stuck at bootstrap for more than 100 mins within the recent 2 hours"

# StuckInCatchup
# Per testnet, take the largest 2h increase of the uptime counter among series
# whose syncStatus is "CATCHUP" and alert once it is at least 6000000
# (100 minutes, per the summary below). The alert's $value is that maximum.
# ${berkeley_testnet} and ${alert_evaluation_duration} are template
# placeholders filled in before the rule file is loaded; ${berkeley_testnet}
# presumably expands to a label matcher selecting the network - confirm
# against the template's caller.
- alert: StuckInCatchup
  expr: max by (testnet) (increase(Coda_Runtime_process_uptime_ms_total{${berkeley_testnet},syncStatus = "CATCHUP"}[2h])) >= 6000000
  for: ${alert_evaluation_duration}
  labels:
    severity: critical
    testnet: "{{ $labels.testnet }}"
  annotations:
    summary: "One or more {{ $labels.testnet }} nodes are stuck at catchup for more than 100 mins within the recent 2 hours"

- alert: HighBlockGossipLatency
expr: max by (testnet) (max_over_time(Coda_Block_latency_gossip_time {${berkeley_testnet},${synced_status_filter}} [${alert_timeframe}])) > 200
for: ${alert_evaluation_duration}
Expand Down

0 comments on commit 605b8c8

Please sign in to comment.