-
-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(symbolicator): Record metrics for projects being assigned and unassigned to the LPQ #29229
Changes from all commits
2a2f386
ca6e5bd
bd62ecc
9ffce55
b025549
a40e857
99e55f0
306b4c1
d04cb34
447e8fb
b945b8a
061c3c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,9 +12,15 @@ | |
import time | ||
from typing import Iterable | ||
|
||
import sentry_sdk | ||
from typing_extensions import Literal | ||
|
||
from sentry import options | ||
from sentry.killswitches import normalize_value | ||
from sentry.processing import realtime_metrics | ||
from sentry.processing.realtime_metrics.base import BucketedCount, DurationHistogram | ||
from sentry.tasks.base import instrumented_task | ||
from sentry.utils import metrics | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -27,7 +33,10 @@ | |
) | ||
def scan_for_suspect_projects() -> None: | ||
"""Scans and updates the list of projects assigned to the low priority queue.""" | ||
_scan_for_suspect_projects() | ||
try: | ||
_scan_for_suspect_projects() | ||
finally: | ||
_record_metrics() | ||
|
||
|
||
def _scan_for_suspect_projects() -> None: | ||
|
@@ -49,8 +58,7 @@ def _scan_for_suspect_projects() -> None: | |
realtime_metrics.remove_projects_from_lpq(expired_projects) | ||
|
||
for project_id in expired_projects: | ||
# TODO: add metrics! | ||
logger.warning("Moved project out of symbolicator's low priority queue: %s", project_id) | ||
_report_change(project_id=project_id, change="removed", reason="no metrics") | ||
|
||
|
||
@instrumented_task( # type: ignore | ||
|
@@ -68,7 +76,10 @@ def update_lpq_eligibility(project_id: int, cutoff: int) -> None: | |
should consider when calculating a project's eligibility. In other words, only data recorded | ||
before `cutoff` should be considered. | ||
""" | ||
_update_lpq_eligibility(project_id, cutoff) | ||
try: | ||
_update_lpq_eligibility(project_id, cutoff) | ||
finally: | ||
_record_metrics() | ||
|
||
|
||
def _update_lpq_eligibility(project_id: int, cutoff: int) -> None: | ||
|
@@ -82,11 +93,60 @@ def _update_lpq_eligibility(project_id: int, cutoff: int) -> None: | |
if is_eligible: | ||
was_added = realtime_metrics.add_project_to_lpq(project_id) | ||
if was_added: | ||
logger.warning("Moved project to symbolicator's low priority queue: %s", project_id) | ||
elif not is_eligible: | ||
_report_change(project_id=project_id, change="added", reason="eligible") | ||
else: | ||
was_removed = realtime_metrics.remove_projects_from_lpq({project_id}) | ||
if was_removed: | ||
logger.warning("Moved project out of symbolicator's low priority queue: %s", project_id) | ||
_report_change(project_id=project_id, change="removed", reason="ineligible") | ||
|
||
|
||
def _report_change(project_id: int, change: Literal["added", "removed"], reason: str) -> None: | ||
if not reason: | ||
reason = "unknown" | ||
Comment on lines
+104
to
+105
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did you make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i don't really have any good reason 🙃 fixed up the types, thanks for catching this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i'm keeping this check in case an empty string does get passed in though |
||
|
||
if change == "added": | ||
message = "Added project to symbolicator's low priority queue" | ||
else: | ||
message = "Removed project from symbolicator's low priority queue" | ||
|
||
with sentry_sdk.push_scope() as scope: | ||
scope.set_level("warning") | ||
scope.set_tag("project", project_id) | ||
scope.set_tag("reason", reason) | ||
sentry_sdk.capture_message(message) | ||
|
||
|
||
def _record_metrics() -> None: | ||
project_count = len(realtime_metrics.get_lpq_projects()) | ||
metrics.gauge( | ||
"tasks.store.symbolicate_event.low_priority.projects.auto", | ||
project_count, | ||
) | ||
|
||
# The manual kill switch is a list of configurations where each config item corresponds to one | ||
# project affected by the switch. The general idea is to grab the raw option, validate its | ||
# contents, and then assume that the length of the validated list corresponds to the number of | ||
# projects in that switch. | ||
|
||
always_included_raw = options.get( | ||
"store.symbolicate-event-lpq-always", | ||
) | ||
always_included = len( | ||
normalize_value("store.symbolicate-event-lpq-always", always_included_raw) | ||
) | ||
metrics.gauge( | ||
"tasks.store.symbolicate_event.low_priority.projects.manual.always", | ||
always_included, | ||
) | ||
|
||
never_included_raw = options.get( | ||
"store.symbolicate-event-lpq-never", | ||
) | ||
never_included = len(normalize_value("store.symbolicate-event-lpq-never", never_included_raw)) | ||
metrics.gauge( | ||
"tasks.store.symbolicate_event.low_priority.projects.manual.never", | ||
never_included, | ||
) | ||
|
||
|
||
def calculation_magic( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these same metrics are now emitted a lot:
I think there are two approaches to this:
After reading option 2 I really don't feel so bad about choosing option 1 anymore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah, as you've noted one of the metrics becomes "eventually" correct and needs a bit of time to update itself.
it's unfortunate that we're leaking this implementation out into our metrics, but it looks like the best compromise, especially since the scanning doesn't require a final synchronization step to do its job properly. as you've probably noticed, that makes 2 hard to commit to just to ensure our metrics are less "jiggly" after a scan.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i was kind of arguing for removing this line to not emit so many duplicate metrics... could we still do that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do you have an example of an instance where an emitted metric is considered to be a duplicate? something to consider is that there is no other place where all of these metrics are being emitted, which means that when a manual kill switch's value is updated, this is the only place that would catch it. if these metrics are only written whenever some project is added or removed to the manual list, then given how frequently we expect projects to enter that list it would mean that it's highly likely that manual changes would not be reflected in the metrics until a significant amount of time has passed since they were made.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This same function is called at
sentry/src/sentry/tasks/low_priority_symbolication.py
Line 39 in 0513d7f
That is at least what I understand, do I have something wrong?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_record_metrics
is called for every project that is eligible for the LPQ, so it isn't necessarily being called for every project.It is being invoked in places where the automatic list is being mutated:
The recorded metrics will lag behind and be 10 seconds late if we don't update them in the latter scenario. The former (scan removal) only performs what is essentially pruning work, and a bulk of the actual updates are performed in the individual project assessments.
Again, if we skip recording after those are completed, all metrics will lag behind by 10s. Changes to manual kill switches will also potentially take up to 10s to be reflected in metrics. Do we want to make this tradeoff for the sake of avoiding the work in emitting duplicate metrics?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.