feat(symbolicator): Automatically add and remove projects to the LPQ based on their historical perf #28714

Merged: 37 commits, Oct 2, 2021

Commits
33a12b3
general start to start scanning for lpq candidates
relaxolotl Sep 21, 2021
b575c67
fill in the basics to get candidates and their associated counts
relaxolotl Sep 22, 2021
4945a9a
add some basic feedback
relaxolotl Sep 22, 2021
fa291f8
add soft timeouts to tasks
relaxolotl Sep 22, 2021
179ad70
be consistent with wording
relaxolotl Sep 22, 2021
13a0690
fix type
relaxolotl Sep 22, 2021
7c3511b
make timings sensible
relaxolotl Sep 22, 2021
06651a4
types
relaxolotl Sep 24, 2021
c08012d
missing celery import
relaxolotl Sep 28, 2021
5540347
use a better name for the task that recomputes the lpq project list
relaxolotl Sep 28, 2021
d602767
use a more project() instead of get_lpq_candidates()
relaxolotl Sep 29, 2021
c4ad504
remove bucketed from function name
relaxolotl Sep 29, 2021
3afe121
calculate commonly used key prefixes in one place
relaxolotl Sep 29, 2021
af46bb4
fix key prefixes to match new format
relaxolotl Sep 29, 2021
f8f9857
use a class for BucketedCount so the fields are typed
relaxolotl Sep 29, 2021
8648509
add the ability to grab durations
relaxolotl Sep 29, 2021
1e8c5f1
fix tests
relaxolotl Sep 29, 2021
02d1251
left logs behind
relaxolotl Sep 29, 2021
2faecf1
mention exception throwing everywhere if it's going to be mentioned i…
relaxolotl Sep 29, 2021
7606322
don't let eligibility calculations block scanning
relaxolotl Sep 29, 2021
1ebdc3a
remove a project from the LPQ to be safe if it isn't eligible for it
relaxolotl Sep 29, 2021
3cf56ac
gooder logging
relaxolotl Sep 29, 2021
11942c0
docstrings
relaxolotl Sep 29, 2021
3af74e1
patch up tests
relaxolotl Sep 29, 2021
8bc782d
style(lint): Auto commit lint changes
getsantry[bot] Sep 29, 2021
991cfbe
make typing happy and fix a silly bug
relaxolotl Sep 29, 2021
6c4ca3e
update tests
relaxolotl Sep 29, 2021
5f192ba
thanks sentrybot
relaxolotl Sep 29, 2021
460e71c
more test updates
relaxolotl Sep 29, 2021
59b4c78
raise an error instead of passing if an unimplemented method is invoked
relaxolotl Sep 29, 2021
b973c84
underscores are good visibility hints
relaxolotl Sep 29, 2021
4a502e1
lost doc changes from base
relaxolotl Sep 29, 2021
00b8a03
remove trycatches and just fail on conversion errors
relaxolotl Sep 29, 2021
9fb4316
compute also updates eligibility
relaxolotl Sep 29, 2021
48dc8cf
update docstring to match rename
relaxolotl Sep 29, 2021
1644da0
match the pattern that exists for this naming scheme
relaxolotl Sep 29, 2021
934093c
missed a rename
relaxolotl Oct 1, 2021
10 changes: 10 additions & 0 deletions src/sentry/conf/server.py
@@ -562,6 +562,7 @@ def SOCIAL_AUTH_DEFAULT_USERNAME():
"sentry.tasks.files",
"sentry.tasks.groupowner",
"sentry.tasks.integrations",
"sentry.tasks.low_priority_symbolication",
"sentry.tasks.members",
"sentry.tasks.merge",
"sentry.tasks.releasemonitor",
@@ -638,6 +639,10 @@ def SOCIAL_AUTH_DEFAULT_USERNAME():
Queue("sleep", routing_key="sleep"),
Queue("stats", routing_key="stats"),
Queue("subscriptions", routing_key="subscriptions"),
Queue(
"symbolications.compute_low_priority_projects",
routing_key="symbolications.compute_low_priority_projects",
),
Queue("unmerge", routing_key="unmerge"),
Queue("update", routing_key="update"),
]
@@ -779,6 +784,11 @@ def create_partitioned_queues(name):
"schedule": timedelta(minutes=20),
"options": {"expires": 20 * 60},
},
"check-symbolicator-lpq-project-eligibility": {
"task": "sentry.tasks.low_priority_symbolication.scan_for_suspect_projects",
"schedule": timedelta(seconds=10),
"options": {"expires": 10},
},
}

BGTASKS = {
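The schedule entry above runs the scan every 10 seconds on its own dedicated queue, with jobs expiring after 10 seconds so missed runs do not pile up. For context, a rough sketch of what the scheduled task could look like; this is an assumption based on Sentry's usual task conventions rather than code from this diff, `update_lpq_eligibility` is a hypothetical helper, and `realtime_metrics.projects()` assumes the module exposes the configured store's methods:

    # Hypothetical sketch; the real task lives in
    # src/sentry/tasks/low_priority_symbolication.py and is not shown in this diff.
    from sentry.processing import realtime_metrics
    from sentry.tasks.base import instrumented_task

    @instrumented_task(
        name="sentry.tasks.low_priority_symbolication.scan_for_suspect_projects",
        queue="symbolications.compute_low_priority_projects",
        soft_time_limit=10,
    )
    def scan_for_suspect_projects() -> None:
        # Walk every project with recorded metrics and recompute whether it
        # belongs in the low priority queue.
        for project_id in realtime_metrics.projects():
            update_lpq_eligibility(project_id)  # hypothetical helper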
95 changes: 80 additions & 15 deletions src/sentry/processing/realtime_metrics/base.py
@@ -1,8 +1,38 @@
from typing import Set
import dataclasses
from typing import Dict, Iterable, NewType, Set

from sentry.utils.services import Service


@dataclasses.dataclass(frozen=True)
class BucketedCount:
"""
Timestamp to count mapping. This represents some `count` amount of something performed
during `timestamp`. `timestamp` is stored in seconds.
"""

timestamp: int
count: int


# Duration to count mapping where the keys are durations and the values are counts. This represents
# some `count` instances of some action where each individual instance took some
# [`duration`, `duration`+10) seconds of time to complete. `duration` is stored in seconds.
BucketedDurations = NewType("BucketedDurations", Dict[int, int])


@dataclasses.dataclass(frozen=True)
class DurationHistogram:
"""
Mapping of timestamp to histogram-like dict of durations. This represents some `count` amount of
some action performed during `timestamp`, where `counts` are grouped by how long that action
took. `timestamp` is stored in seconds.
"""

timestamp: int
histogram: BucketedDurations


Contributor commented:
i like all the nice custom types 😄
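As a quick illustration (not part of the diff) of how these types compose, with all values invented:

    # Five requests finished in [0, 10) seconds and two in [30, 40) seconds
    # during the bucket starting at this (invented) timestamp.
    count = BucketedCount(timestamp=1632900000, count=7)
    histogram = DurationHistogram(
        timestamp=1632900000,
        histogram=BucketedDurations({0: 5, 30: 2}),
    )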

class RealtimeMetricsStore(Service): # type: ignore
"""A service for storing metrics about incoming requests within a given time window."""

@@ -23,7 +53,7 @@ def increment_project_event_counter(self, project_id: int, timestamp: int) -> None
time-window bucket with "timestamp" providing the time of the event
in seconds since the UNIX epoch (i.e., as returned by time.time()).
"""
pass
raise NotImplementedError

def increment_project_duration_counter(
self, project_id: int, timestamp: int, duration: int
@@ -34,32 +64,67 @@ def increment_project_duration_counter(
Calling this increments the counter of the current time-window bucket with "timestamp" providing
the time of the event in seconds since the UNIX epoch and "duration" the processing time in seconds.
"""
pass
raise NotImplementedError

def projects(self) -> Iterable[int]:
"""
Returns IDs of all projects that should be considered for the low priority queue.
"""
Comment on lines +70 to +72
loewenheim (Contributor) commented Sep 29, 2021:
I think this would be better as

Suggested change:
-    Returns IDs of all projects that should be considered for the low priority queue.
+    Returns IDs of all projects for which metrics have been recorded in the store.

or something like that. RealtimeMetricsStore is not billed as having anything to do with the low priority queue, so this mention of it comes a bit out of left field.

relaxolotl (Author) replied:
good point, thanks for catching this!

raise NotImplementedError

def get_counts_for_project(self, project_id: int) -> Iterable[BucketedCount]:
"""
Returns a sorted list of bucketed timestamps paired with the count of symbolicator requests
made during that time for some given project.
"""
raise NotImplementedError

def get_durations_for_project(self, project_id: int) -> Iterable[DurationHistogram]:
"""
Returns a sorted list of bucketed timestamps paired with a histogram-like dictionary of
symbolication durations, grouped into 10-second buckets, recorded during that time for some
given project.
"""
raise NotImplementedError

def get_lpq_projects(self) -> Set[int]:
"""
Fetches the list of projects that are currently using the low priority queue.

Returns a list of project IDs.
"""
pass
raise NotImplementedError

def add_project_to_lpq(self, project_id: int) -> bool:
"""
Assigns a project to the low priority queue.

This registers an intent to redirect all symbolication events triggered by the specified
project to the low priority queue.

Returns True if the project was a new addition to the list. Returns False if it was already
assigned to the low priority queue.
Comment on lines +104 to +105
Contributor commented:
What does this return if the project wasn't in the lpq and hasn't been added because it's on the never list?

"""
raise NotImplementedError

def add_project_to_lpq(self, project_id: int) -> None:
def remove_project_from_lpq(self, project_id: int) -> bool:
"""
Moves a project to the low priority queue.
Removes a project from the low priority queue.

This registers an intent to restore all specified projects back to the regular queue.

This forces all symbolication events triggered by the specified project to be redirected to
the low priority queue, unless the project is manually excluded from the low priority queue
via the `store.symbolicate-event-lpq-never` kill switch.
Returns True if the project was assigned to the queue prior to its removal. Returns False if
it wasn't assigned to the queue to begin with.
Comment on lines +115 to +116
Contributor commented:
Similar to above: what happens if the project was in the queue, but hasn't been removed because it's in the always list?

relaxolotl (Author) replied:
let me try to address both questions in one comment:

i think i phrased the documentation poorly, particularly when it comes to mentioning these two kill switches. any mutations of the value stored in redis (i.e. store.symbolicate-event-lpq-selected) do not read or take into consideration the two manual kill switches (store.symbolicate-event-lpq-never and store.symbolicate-event-lpq-always).

i mentioned the switches as a way to provide context if a project's events continued to be routed into the "wrong" queue despite these clearly being invoked, but i think that just made things more confusing. the only place that does collect all three of -selected, -never, and -always exclusively reads the contents of those values, but it does not mutate them.

i've changed the wording on the docstring to better reflect this in this PR and the parent PR: #28757

let me know what you think of the updated docstrings.

Contributor replied:
Ok, let me see if I understand this correctly. Before, my main issue with the documentation was this: does the return value of (e.g.) add_project_to_lpq reflect whether the project was added or whether it is now in the lpq? But if the killswitches aren't taken into account, the latter is meaningless because an addition always results in the project being in the lpq. Thus returning whether it was added (or removed) is the only useful option.

"""
pass
raise NotImplementedError

def remove_projects_from_lpq(self, project_ids: Set[int]) -> None:
def remove_projects_from_lpq(self, project_ids: Set[int]) -> int:
"""
Removes projects from the low priority queue.

This restores all specified projects back to the regular queue, unless they have been
manually forced into the low priority queue via the `store.symbolicate-event-lpq-always`
kill switch.
This registers an intent to restore all specified projects back to the regular queue.

Returns the number of projects that were actively removed from the queue. Any projects that
were not assigned to the low priority queue to begin with will be omitted from the return
value.
"""
pass
raise NotImplementedError
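Taken together, this interface gives a scanning task everything it needs to move projects in and out of the low priority queue. A hedged sketch of such a caller, with an invented threshold and helper name:

    # Hypothetical eligibility check; the real calculation performed by
    # sentry.tasks.low_priority_symbolication is not part of this diff.
    def update_lpq_eligibility(store: RealtimeMetricsStore, project_id: int) -> None:
        total = sum(bucket.count for bucket in store.get_counts_for_project(project_id))
        if total > 10_000:  # invented threshold
            store.add_project_to_lpq(project_id)
        else:
            store.remove_project_from_lpq(project_id)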
135 changes: 125 additions & 10 deletions src/sentry/processing/realtime_metrics/redis.py
@@ -1,5 +1,7 @@
import datetime
from typing import Set
import logging
from itertools import chain
from typing import Iterable, Set

from sentry.exceptions import InvalidConfiguration
from sentry.utils import redis
@@ -9,6 +11,8 @@
# redis key for entry storing current list of LPQ members
LPQ_MEMBERS_KEY = "store.symbolicate-event-lpq-selected"

logger = logging.getLogger(__name__)


class RedisRealtimeMetricsStore(base.RealtimeMetricsStore):
"""An implementation of RealtimeMetricsStore based on a Redis backend."""
@@ -46,6 +50,12 @@ def validate(self) -> None:
if self._histogram_bucket_size <= 0:
raise InvalidConfiguration("histogram bucket size must be at least 1")

def _counter_key_prefix(self) -> str:
return f"{self._prefix}:counter:{self._counter_bucket_size}"

def _histogram_key_prefix(self) -> str:
return f"{self._prefix}:histogram:{self._histogram_bucket_size}"

def increment_project_event_counter(self, project_id: int, timestamp: int) -> None:
"""Increment the event counter for the given project_id.

@@ -58,7 +68,7 @@ def increment_project_event_counter(self, project_id: int, timestamp: int) -> None
if self._counter_bucket_size > 1:
timestamp -= timestamp % self._counter_bucket_size

key = f"{self._prefix}:counter:{self._counter_bucket_size}:{project_id}:{timestamp}"
key = f"{self._counter_key_prefix()}:{project_id}:{timestamp}"

with self.cluster.pipeline() as pipeline:
pipeline.incr(key)
@@ -77,23 +87,112 @@ def increment_project_duration_counter(
if self._histogram_bucket_size > 1:
timestamp -= timestamp % self._histogram_bucket_size

key = f"{self._prefix}:histogram:{self._histogram_bucket_size}:{project_id}:{timestamp}"
key = f"{self._histogram_key_prefix()}:{project_id}:{timestamp}"
duration -= duration % 10

with self.cluster.pipeline() as pipeline:
pipeline.hincrby(key, duration, 1)
pipeline.pexpire(key, self._histogram_ttl)
pipeline.execute()

def projects(self) -> Iterable[int]:
"""
Returns IDs of all projects for which metrics have been recorded in the store.

This may throw an exception if there is some sort of issue scanning the redis store for
projects.
"""

already_seen = set()
# Normally if there's a histogram entry for a project then there should be a counter
# entry for it as well, but double check both to be safe
all_keys = chain(
self.cluster.scan_iter(
match=self._counter_key_prefix() + ":*",
),
self.cluster.scan_iter(
match=self._histogram_key_prefix() + ":*",
),
)

for item in all_keys:
# Because this could be one of two patterns, this splits based on the most basic
# delimiter ":" instead of splitting on known prefixes
_prefix, _metric_type, _bucket_size, project_id_raw, _else = item.split(":", maxsplit=4)
project_id = int(project_id_raw)
if project_id not in already_seen:
already_seen.add(project_id)
yield project_id
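A worked example of the split above, reusing the hypothetical prefix from earlier and assuming the configured prefix itself contains no ":":

    # "symbolicate_event_low_priority:counter:10:42:1632900000".split(":", maxsplit=4)
    # == ["symbolicate_event_low_priority", "counter", "10", "42", "1632900000"]
    # so project_id_raw is "42" and the trailing timestamp lands in _else.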

def get_counts_for_project(self, project_id: int) -> Iterable[base.BucketedCount]:
"""
Returns a sorted list of bucketed timestamps paired with the count of symbolicator requests
made during that time for some given project.

This may throw an exception if there is some sort of issue fetching counts from the redis
store.
"""
key_prefix = f"{self._counter_key_prefix()}:{project_id}:"

keys = sorted(
self.cluster.scan_iter(
match=key_prefix + "*",
)
)
counts = self.cluster.mget(keys)
for key, count_raw in zip(keys, counts):
_, timestamp_raw = key.split(key_prefix)

timestamp_bucket = int(timestamp_raw)
count = int(count_raw)
yield base.BucketedCount(timestamp=timestamp_bucket, count=count)

def get_durations_for_project(self, project_id: int) -> Iterable[base.DurationHistogram]:
"""
Returns a sorted list of bucketed timestamps paired with a histogram-like dictionary of
symbolication durations made during some timestamp for some given project.

For a given `{duration:count}` entry in the dictionary bound to a specific `timestamp`:

- `duration` represents the amount of time it took for a symbolication request to complete.
Durations are bucketed by 10secs, meaning that a `duration` of `30` covers all requests that
took between 30-39 seconds.
Contributor commented:
[30, 40) if you want to go mathematical 😉 (which you do below)

relaxolotl (Author) replied:
yep! added this in just in case the user's unfamiliar with the syntax: i would imagine trying to look this up would be a pain if all you knew was "square bracket" and "round bracket"


- `count` is the number of symbolication requests that took some amount of time within the
range of `[duration, duration+10)` to complete.

This may throw an exception if there is some sort of issue fetching durations from the redis
store.
"""
key_prefix = f"{self._histogram_key_prefix()}:{project_id}:"
keys = sorted(
self.cluster.scan_iter(
match=key_prefix + "*",
)
)

for key in keys:
_, timestamp_raw = key.split(key_prefix)
timestamp_bucket = int(timestamp_raw)

histogram_raw = self.cluster.hgetall(key)
histogram = base.BucketedDurations(
{int(duration): int(count) for duration, count in histogram_raw.items()}
)
yield base.DurationHistogram(timestamp=timestamp_bucket, histogram=histogram)
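A hedged read-side sketch (not in the diff) of consuming this generator; the configured store instance and all values are assumed:

    for entry in store.get_durations_for_project(42):
        # e.g. entry.timestamp == 1632900000 and entry.histogram == {0: 5, 30: 2},
        # i.e. five requests took [0, 10)s and two took [30, 40)s in that bucket.
        slow = sum(count for duration, count in entry.histogram.items() if duration >= 30)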

def get_lpq_projects(self) -> Set[int]:
"""
Fetches the list of projects that are currently using the low priority queue.

Returns a list of project IDs.

This may throw an exception if there is some sort of issue fetching the list from the redis
store.
"""
return {int(project_id) for project_id in self.cluster.smembers(LPQ_MEMBERS_KEY)}

def add_project_to_lpq(self, project_id: int) -> None:
def add_project_to_lpq(self, project_id: int) -> bool:
"""
Assigns a project to the low priority queue.

@@ -105,11 +204,25 @@ def add_project_to_lpq(self, project_id: int) -> None:
"""

# This returns 0 if project_id was already in the set, 1 if it was added, and throws an
# exception if there's a problem so it's fine if we just ignore the return value of this as
# the project is always added if this successfully completes.
self.cluster.sadd(LPQ_MEMBERS_KEY, project_id)
# exception if there's a problem. If this successfully completes then the project is
# expected to be in the set.
return int(self.cluster.sadd(LPQ_MEMBERS_KEY, project_id)) > 0

def remove_project_from_lpq(self, project_id: int) -> bool:
"""
Removes a project from the low priority queue.

This restores the specified project back to the regular queue, unless it has been
manually forced into the low priority queue via the `store.symbolicate-event-lpq-always`
kill switch.

This may throw an exception if there is some sort of issue deregistering the projects from
the queue.
"""

return self.remove_projects_from_lpq({project_id}) > 0

def remove_projects_from_lpq(self, project_ids: Set[int]) -> None:
def remove_projects_from_lpq(self, project_ids: Set[int]) -> int:
"""
Removes projects from the low priority queue.

@@ -119,6 +232,8 @@ the queue.
the queue.
"""
if len(project_ids) == 0:
return
return 0

self.cluster.srem(LPQ_MEMBERS_KEY, *project_ids)
# This returns the number of projects removed, and throws an exception if there's a problem.
# If this successfully completes then the projects are expected to no longer be in the set.
return int(self.cluster.srem(LPQ_MEMBERS_KEY, *project_ids))
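Following the kill-switch discussion above, store.symbolicate-event-lpq-never and store.symbolicate-event-lpq-always are only consulted at read time and are never mutated here. A hedged sketch of how the routing code might combine the three values; the membership sources are assumptions based on that discussion, not code from this diff:

    # Hypothetical routing check combining the three kill switches.
    def should_use_lpq(
        project_id: int,
        lpq_never: Set[int],
        lpq_always: Set[int],
        store: RedisRealtimeMetricsStore,
    ) -> bool:
        if project_id in lpq_never:
            return False
        if project_id in lpq_always:
            return True
        return project_id in store.get_lpq_projects()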