From fe5f13777490ce1f670cc4c7e9d1f7768f8634a2 Mon Sep 17 00:00:00 2001 From: Kirsten Garrison Date: Mon, 26 Oct 2020 16:27:20 -0700 Subject: [PATCH] metrics: Add 15m duration to MCDDrainError alert This gives the cluster time to retry the drain several times before an alert is sent, which minimizes false positives. --- install/0000_90_machine-config-operator_01_prometheus-rules.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml index 2f219d6829..d570445ce4 100644 --- a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml +++ b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml @@ -23,6 +23,7 @@ spec: mcd_drain_err > 0 labels: severity: warning + for: 15m annotations: message: "Drain failed on {{ $labels.node }} , updates may be blocked. For more details: oc logs -f -n openshift-machine-config-operator machine-config-daemon- -c machine-config-daemon" - name: mcd-pivot-error