Skip to content

Commit

Permalink
fix(metric_alerts): Fix bug where transitioning from critical -> warn…
Browse files Browse the repository at this point in the history
…ing can still trigger a resolve action (#29345)

Was investigating #27161, although I suspect that it's
already resolved since it's from back in July and we pushed out a fix in Auguest.

I happened to figure out the error we see on our own alert in #team-engineering though. Just to
cover this issue again:
 - Metric goes above the critical threshold, so we fire a critical alert
 - Metric goes below critical but remains above warning. This should result in firing the warning
   threshold again, but actually it triggers a resolve.

There are two parts to this issue.
1. Our metric alert is misconfigured, and doesn't have an action set up for warning. So warning
thresholds will literally never fire or do anything. I have fixed this.
2. If the critical/warning actions are different, then the de-duping that we have in place won't
work. In our case, since critical has an action and warning doesn't, we end up just using the
critical action, which is tied to the resolve. This results in us showing a resolve notification.

To fix this, when we transition from critical -> warning I only include the incident trigger
associated with the active warning trigger in `fired_incident_triggers`, and leave the critical
trigger out. This means that if someone configures an alert with both warning and critical triggers,
but fails to set up an action on the warning trigger, then the only time the alert will fire is when
the critical alert initially fires.

I think this is most correct and realistically it doesn't make sense to set up a threshold without
an action. We should probably prevent alerts from being created like this in the UI.
  • Loading branch information
wedamija authored Oct 15, 2021
1 parent 5745efd commit 654d319
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 12 deletions.
15 changes: 7 additions & 8 deletions src/sentry/incidents/subscription_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,10 @@ def calculate_resolve_threshold(self, trigger):

return trigger.alert_threshold + resolve_add

def find_and_fire_active_warning_trigger(self, alert_operator, aggregation_value):
def find_active_warning_trigger(self, alert_operator, aggregation_value):
"""
Function used to re-fire a Warning trigger when making the transition from Critical to
Warning
Finds and returns an active warning trigger, if one exists on this alert
"""
active_warning_it = None
for it in self.incident_triggers.values():
current_trigger = it.alert_rule_trigger
# Check if there is a Warning incident trigger that is active, and then check if the
Expand All @@ -167,8 +165,7 @@ def find_and_fire_active_warning_trigger(self, alert_operator, aggregation_value
and alert_operator(aggregation_value, current_trigger.alert_threshold)
):
metrics.incr("incidents.alert_rules.threshold", tags={"type": "alert"})
active_warning_it = self.trigger_alert_threshold(current_trigger, aggregation_value)
return active_warning_it
return it

def get_comparison_aggregation_value(self, subscription_update, aggregation_value):
# For comparison alerts run a query over the comparison period and use it to calculate the
Expand Down Expand Up @@ -364,17 +361,19 @@ def process_update(self, subscription_update):
# warning threshold, and so we will check if we are above the warning
# threshold and if so fire a warning alert
# This is mainly for handling transition from Critical -> Warning
active_warning_it = None
if (
incident_trigger.alert_rule_trigger.label == CRITICAL_TRIGGER_LABEL
and self.incident_triggers
):
active_warning_it = self.find_and_fire_active_warning_trigger(
active_warning_it = self.find_active_warning_trigger(
alert_operator=alert_operator, aggregation_value=aggregation_value
)
if active_warning_it is not None:
fired_incident_triggers.append(active_warning_it)

fired_incident_triggers.append(incident_trigger)
if active_warning_it is None:
fired_incident_triggers.append(incident_trigger)
else:
self.trigger_resolve_counts[trigger.id] = 0

Expand Down
110 changes: 106 additions & 4 deletions tests/sentry/incidents/test_subscription_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,14 +283,16 @@ def assert_action_handler_called_with_actions(self, incident, actions, project=N

if not actions:
if not incident:
assert not self.email_action_handler.called
assert (
not self.email_action_handler.called
), self.email_action_handler.call_args_list
else:
for call_args in self.email_action_handler.call_args_list:
assert call_args[0][1] != incident
else:
self.email_action_handler.assert_has_calls(
[call(action, incident, project) for action in actions], any_order=True
)
assert self.email_action_handler.call_args_list == [
call(action, incident, project) for action in actions
]

def assert_actions_fired_for_incident(self, incident, actions=None, project=None):
actions = [] if actions is None else actions
Expand Down Expand Up @@ -810,6 +812,106 @@ def test_multiple_subscriptions_do_not_conflict(self):
self.assert_action_handler_called_with_actions(other_incident, [])

def test_multiple_triggers(self):
rule = self.rule
rule.update(threshold_period=1)
trigger = self.trigger
warning_trigger = create_alert_rule_trigger(
self.rule, WARNING_TRIGGER_LABEL, trigger.alert_threshold - 20
)
warning_action = create_alert_rule_trigger_action(
warning_trigger,
AlertRuleTriggerAction.Type.EMAIL,
AlertRuleTriggerAction.TargetType.USER,
str(self.user.id),
)
processor = self.send_update(
rule, warning_trigger.alert_threshold + 1, timedelta(minutes=-10), subscription=self.sub
)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
self.assert_trigger_counts(processor, trigger, 0, 0)
incident = self.assert_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.ACTIVE)
self.assert_trigger_does_not_exist(trigger)
self.assert_actions_fired_for_incident(incident, [warning_action])

processor = self.send_update(
rule, trigger.alert_threshold + 1, timedelta(minutes=-9), subscription=self.sub
)
self.assert_trigger_counts(processor, trigger, 0, 0)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
incident = self.assert_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.ACTIVE)
self.assert_trigger_exists_with_status(incident, trigger, TriggerStatus.ACTIVE)
self.assert_actions_fired_for_incident(incident, [self.action])

processor = self.send_update(
rule, trigger.alert_threshold - 1, timedelta(minutes=-7), subscription=self.sub
)
self.assert_trigger_counts(processor, trigger, 0, 0)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
incident = self.assert_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, trigger, TriggerStatus.RESOLVED)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.ACTIVE)
self.assert_actions_fired_for_incident(incident, [warning_action])

processor = self.send_update(
rule, rule.resolve_threshold - 1, timedelta(minutes=-6), subscription=self.sub
)
self.assert_trigger_counts(processor, trigger, 0, 0)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
self.assert_no_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, trigger, TriggerStatus.RESOLVED)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.RESOLVED)
self.assert_actions_resolved_for_incident(incident, [warning_action])

def test_multiple_triggers_no_warning_action(self):
rule = self.rule
rule.update(threshold_period=1)
trigger = self.trigger
warning_trigger = create_alert_rule_trigger(
self.rule, WARNING_TRIGGER_LABEL, trigger.alert_threshold - 20
)
processor = self.send_update(
rule, warning_trigger.alert_threshold + 1, timedelta(minutes=-10), subscription=self.sub
)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
self.assert_trigger_counts(processor, trigger, 0, 0)
incident = self.assert_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.ACTIVE)
self.assert_trigger_does_not_exist(trigger)
self.assert_action_handler_called_with_actions(None, [])

processor = self.send_update(
rule, trigger.alert_threshold + 1, timedelta(minutes=-9), subscription=self.sub
)
self.assert_trigger_counts(processor, trigger, 0, 0)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
incident = self.assert_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.ACTIVE)
self.assert_trigger_exists_with_status(incident, trigger, TriggerStatus.ACTIVE)
self.assert_actions_fired_for_incident(incident, [self.action])

processor = self.send_update(
rule, trigger.alert_threshold - 1, timedelta(minutes=-7), subscription=self.sub
)
self.assert_trigger_counts(processor, trigger, 0, 0)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
incident = self.assert_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, trigger, TriggerStatus.RESOLVED)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.ACTIVE)
self.assert_action_handler_called_with_actions(None, [])

processor = self.send_update(
rule, rule.resolve_threshold - 1, timedelta(minutes=-6), subscription=self.sub
)
self.assert_trigger_counts(processor, trigger, 0, 0)
self.assert_trigger_counts(processor, warning_trigger, 0, 0)
self.assert_no_active_incident(rule, self.sub)
self.assert_trigger_exists_with_status(incident, trigger, TriggerStatus.RESOLVED)
self.assert_trigger_exists_with_status(incident, warning_trigger, TriggerStatus.RESOLVED)
self.assert_action_handler_called_with_actions(None, [])

def test_multiple_triggers_threshold_period(self):
rule = self.rule
rule.update(threshold_period=2)
trigger = self.trigger
Expand Down

0 comments on commit 654d319

Please sign in to comment.