address more PR comments

grafana · Mar 17, 2023 · 5e96d7d · 5e96d7d
1 parent bc87356
commit 5e96d7d
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 47 deletions.
diff --git a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py
@@ -105,13 +105,6 @@ def next_user_in_sorted_queue(self) -> User:
         next_user = users_queue[(last_user_index + 1) % len(users_queue)]
         return next_user
 
-    def has_triggered_log_record(self, alert_group_id: int) -> bool:
-        return AlertGroupLogRecord.objects.filter(
-            escalation_policy_id=self.id,
-            alert_group_id=alert_group_id,
-            type=AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED,
-        ).exists()
-
     def execute(self, alert_group, reason) -> StepExecutionResultData:
         action_map = {
             EscalationPolicy.STEP_WAIT: self._escalation_step_wait,

diff --git a/engine/apps/alerts/tasks/check_escalation_finished.py b/engine/apps/alerts/tasks/check_escalation_finished.py
@@ -30,6 +30,8 @@ def send_alert_group_escalation_auditor_task_heartbeat() -> None:
 
 
 def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
+    AlertGroupLogRecord = apps.get_model("alerts", "AlertGroupLogRecord")
+
     escalation_snapshot = alert_group.escalation_snapshot
     alert_group_id = alert_group.id
     base_msg = f"Alert group {alert_group_id}"
@@ -69,18 +71,29 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
         f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies"
     )
 
-    for executed_escalation_policy_snapshot in executed_escalation_policy_snapshots:
-        escalation_policy_id = executed_escalation_policy_snapshot.id
-
-        if not executed_escalation_policy_snapshot.has_triggered_log_record(alert_group_id):
-            raise AlertGroupEscalationPolicyExecutionAuditException(
-                f"{base_msg}'s escalation policy snapshot {escalation_policy_id} does not have a triggered alert group log record associated with it"
-            )
+    # compare number of triggered/failed alert group log records to the number of executed
+    # escalation policy snapshot steps
+    num_of_relevant_log_records = AlertGroupLogRecord.objects.filter(
+        alert_group_id=alert_group_id,
+        type__in=[AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED],
+    ).count()
 
-        task_logger.info(
-            f"{base_msg}'s escalation policy snapshot {escalation_policy_id} has a triggered alert group log record associated with it"
+    if num_of_relevant_log_records < num_of_executed_escalation_policy_snapshots:
+        raise AlertGroupEscalationPolicyExecutionAuditException(
+            f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is less "
+            f"than the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
         )
 
+    task_logger.info(
+        f"{base_msg}'s number of triggered/failed alert group log records ({num_of_relevant_log_records}) is greater "
+        f"than or equal to the number of executed escalation policy snapshot steps ({num_of_executed_escalation_policy_snapshots})"
+    )
+
+    # TODO: check following steps:
+    # STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW
+    # STEP_REPEAT_ESCALATION_N_TIMES
+    # not sure what to do here?
+
     task_logger.info(f"{base_msg} passed the audit checks")
 
 
@@ -99,7 +112,7 @@ def get_auditable_alert_groups_started_at_range() -> typing.Tuple[datetime.datet
     alert groups, whose integration did not have an escalation chain at the time the alert group was created
     we would raise errors
     """
-    return (datetime.datetime(2023, 3, 5), timezone.now() - timezone.timedelta(days=2))
+    return (datetime.datetime(2023, 3, 25), timezone.now() - timezone.timedelta(days=2))
 
 
 # don't retry this task as the AlertGroup DB query is rather expensive

diff --git a/engine/apps/alerts/tests/test_check_escalation_finished_task.py b/engine/apps/alerts/tests/test_check_escalation_finished_task.py
@@ -1,3 +1,4 @@
+import random
 from unittest.mock import Mock, PropertyMock, patch
 
 import pytest
@@ -16,6 +17,10 @@
 MOCKED_HEARTBEAT_URL = "https://hello.com/lsdjjkf"
 
 
+def _get_relevant_log_record_type() -> int:
+    return random.choice([AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, AlertGroupLogRecord.TYPE_ESCALATION_FAILED])
+
+
 def test_send_alert_group_escalation_auditor_task_heartbeat_does_not_call_the_heartbeat_url_if_one_is_not_configured():
     with patch("apps.alerts.tasks.check_escalation_finished.requests") as mock_requests:
         send_alert_group_escalation_auditor_task_heartbeat()
@@ -127,9 +132,9 @@ def test_audit_alert_group_escalation_all_executed_escalation_policy_snapshots_h
 
     for escalation_policy_snapshot in escalation_policies_snapshots:
         escalation_policy = EscalationPolicy.objects.get(id=escalation_policy_snapshot.id)
-        make_alert_group_log_record(
-            alert_group, AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, user, escalation_policy=escalation_policy
-        )
+        log_record_type = _get_relevant_log_record_type()
+
+        make_alert_group_log_record(alert_group, log_record_type, user, escalation_policy=escalation_policy)
 
     with patch(
         "apps.alerts.escalation_snapshot.snapshot_classes.escalation_snapshot.EscalationSnapshot.executed_escalation_policy_snapshots",
@@ -150,12 +155,12 @@ def test_audit_alert_group_escalation_one_executed_escalation_policy_snapshot_do
     alert_group, _, _, _ = escalation_snapshot_test_setup
     escalation_policies_snapshots = alert_group.escalation_snapshot.escalation_policies_snapshots
 
-    # let's skip creating a TRIGGERED alert group log record for the first executed escalation policy
+    # let's skip creating a relevant alert group log record for the first executed escalation policy
     for idx, escalation_policy_snapshot in enumerate(escalation_policies_snapshots):
         if idx != 0:
             escalation_policy = EscalationPolicy.objects.get(id=escalation_policy_snapshot.id)
             make_alert_group_log_record(
-                alert_group, AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, user, escalation_policy=escalation_policy
+                alert_group, _get_relevant_log_record_type(), user, escalation_policy=escalation_policy
             )
 
     with patch(

diff --git a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py
@@ -568,28 +568,3 @@ def test_escalation_step_with_deleted_user(
 
     deserialized_escalation_snapshot = EscalationPolicySnapshotSerializer().to_internal_value(raw_snapshot)
     assert deserialized_escalation_snapshot["notify_to_users_queue"] == [user]
-
-
-@pytest.mark.django_db
-@pytest.mark.parametrize(
-    "log_record_type,expected",
-    [
-        (AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, True),
-        (AlertGroupLogRecord.TYPE_ESCALATION_FAILED, False),
-        (AlertGroupLogRecord.TYPE_ESCALATION_FINISHED, False),
-    ],
-)
-def has_triggered_log_record(
-    escalation_step_test_setup, make_escalation_policy, make_alert_group_log_record, log_record_type, expected
-):
-    _, user, _, channel_filter, alert_group, _ = escalation_step_test_setup
-    wait_step = make_escalation_policy(
-        escalation_chain=channel_filter.escalation_chain,
-        escalation_policy_step=EscalationPolicy.STEP_WAIT,
-        wait_delay=EscalationPolicy.FIFTEEN_MINUTES,
-    )
-
-    make_alert_group_log_record(alert_group, log_record_type, user, escalation_policy=wait_step)
-
-    escalation_policy_snapshot = get_escalation_policy_snapshot_from_model(wait_step)
-    assert escalation_policy_snapshot.has_triggered_log_record() == expected