Skip to content

Commit

Permalink
ref(grouping): Add get_hash_values helper (#63731)
Browse files Browse the repository at this point in the history
This pulls all of the grouping logic calculating hashes out of `save_error_events` into a new helper, `get_hash_values`, both for organization/encapsulation purposes and to make it easier to reason about code changes coming in future PRs.
  • Loading branch information
lobsterkatie authored and snigdhas committed Jan 26, 2024
1 parent b24b01e commit 1010f72
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 95 deletions.
87 changes: 3 additions & 84 deletions src/sentry/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,10 @@
from sentry.eventtypes import EventType
from sentry.eventtypes.transaction import TransactionEvent
from sentry.exceptions import HashDiscarded
from sentry.grouping.api import (
GroupingConfig,
SecondaryGroupingConfigLoader,
get_grouping_config_dict_for_event_data,
get_grouping_config_dict_for_project,
)
from sentry.grouping.api import GroupingConfig, get_grouping_config_dict_for_project
from sentry.grouping.ingest import (
calculate_primary_hash,
calculate_secondary_hash,
find_existing_grouphash,
maybe_run_background_grouping,
should_run_secondary_grouping,
get_hash_values,
update_grouping_config_if_needed,
)
from sentry.grouping.result import CalculatedHashes
Expand Down Expand Up @@ -517,80 +509,7 @@ def save_error_events(
_derive_plugin_tags_many(jobs, projects)
_derive_interface_tags_many(jobs)

# Background grouping is a way for us to get performance metrics for a new config without
# having it actually affect how events are grouped. So as to not slow down overall
# ingestion too much, it only runs on a fraction of events.
maybe_run_background_grouping(project, job)

secondary_hashes = None

if should_run_secondary_grouping(project):
with metrics.timer("event_manager.secondary_grouping", tags=metric_tags):
secondary_grouping_config = SecondaryGroupingConfigLoader().get_config_dict(project)
secondary_hashes = calculate_secondary_hash(project, job, secondary_grouping_config)

with metrics.timer("event_manager.load_grouping_config"):
# At this point we want to normalize the in_app values in case the
# clients did not set this appropriately so far.
if is_reprocessed:
# The customer might have changed grouping enhancements since
# the event was ingested -> make sure we get the fresh one for reprocessing.
grouping_config = get_grouping_config_dict_for_project(project)
# Write back grouping config because it might have changed since the
# event was ingested.
# NOTE: We could do this unconditionally (regardless of `is_processed`).
job["data"]["grouping_config"] = grouping_config
else:
grouping_config = get_grouping_config_dict_for_event_data(
job["event"].data.data, project
)

with sentry_sdk.start_span(
op="event_manager",
description="event_manager.save.calculate_event_grouping",
), metrics.timer("event_manager.calculate_event_grouping", tags=metric_tags):
primary_hashes = calculate_primary_hash(project, job, grouping_config)

if secondary_hashes:
tags = {
"primary_config": grouping_config["id"],
"secondary_config": secondary_grouping_config["id"],
}
current_values = primary_hashes.hashes
secondary_values = secondary_hashes.hashes
hashes_match = current_values == secondary_values

if hashes_match:
tags["result"] = "no change"
else:
shared_hashes = set(current_values) & set(secondary_values)
if len(shared_hashes) > 0:
tags["result"] = "partial change"
else:
tags["result"] = "full change"

metrics.incr("grouping.hash_comparison", tags=tags)

# Track the total number of grouping calculations done overall, so we can divide by the
# count to get an average number of calculations per event
metrics.incr("grouping.hashes_calculated", amount=2 if secondary_hashes else 1)

all_hashes = CalculatedHashes(
hashes=list(primary_hashes.hashes)
+ list(secondary_hashes and secondary_hashes.hashes or []),
hierarchical_hashes=(
list(primary_hashes.hierarchical_hashes)
+ list(secondary_hashes and secondary_hashes.hierarchical_hashes or [])
),
tree_labels=(
primary_hashes.tree_labels
or (secondary_hashes and secondary_hashes.tree_labels)
or []
),
)

if all_hashes.tree_labels:
job["finest_tree_label"] = all_hashes.finest_tree_label
primary_hashes, secondary_hashes, all_hashes = get_hash_values(project, job, metric_tags)

# Because this logic is not complex enough we want to special case the situation where we
# migrate from a hierarchical hash to a non hierarchical hash. The reason being that
Expand Down
92 changes: 88 additions & 4 deletions src/sentry/grouping/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
BackgroundGroupingConfigLoader,
GroupingConfig,
GroupingConfigNotFound,
SecondaryGroupingConfigLoader,
apply_server_fingerprinting,
detect_synthetic_exception,
get_fingerprinting_config_for_project,
get_grouping_config_dict_for_event_data,
get_grouping_config_dict_for_project,
load_grouping_config,
)
Expand All @@ -26,6 +28,7 @@
from sentry.models.grouphash import GroupHash
from sentry.models.project import Project
from sentry.projectoptions.defaults import BETA_GROUPING_CONFIG, DEFAULT_GROUPING_CONFIG
from sentry.reprocessing2 import is_reprocessed_event
from sentry.utils import metrics
from sentry.utils.metrics import MutableTags
from sentry.utils.tag_normalization import normalized_sdk_tag_from_event
Expand Down Expand Up @@ -143,7 +146,7 @@ def _calculate_event_grouping(
return hashes


def maybe_run_background_grouping(project: Project, job: Job) -> None:
def _maybe_run_background_grouping(project: Project, job: Job) -> None:
"""
Optionally run a fraction of events with an experimental grouping config.
Expand Down Expand Up @@ -173,14 +176,14 @@ def _calculate_background_grouping(
return _calculate_event_grouping(project, event, config)


def should_run_secondary_grouping(project: Project) -> bool:
def _should_run_secondary_grouping(project: Project) -> bool:
secondary_grouping_config = project.get_option("sentry:secondary_grouping_config")
secondary_grouping_expiry = project.get_option("sentry:secondary_grouping_expiry")

return secondary_grouping_config and (secondary_grouping_expiry or 0) >= time.time()


def calculate_secondary_hash(
def _calculate_secondary_hash(
project: Project, job: Job, secondary_grouping_config: GroupingConfig
) -> None | CalculatedHashes:
"""Calculate secondary hash for event using a fallback grouping config for a period of time.
Expand All @@ -206,7 +209,7 @@ def calculate_secondary_hash(
return secondary_hashes


def calculate_primary_hash(
def _calculate_primary_hash(
project: Project, job: Job, grouping_config: GroupingConfig
) -> CalculatedHashes:
"""
Expand Down Expand Up @@ -295,3 +298,84 @@ def find_existing_grouphash(
)

return None, root_hierarchical_hash


def get_hash_values(
    project: Project,
    job: Job,
    metric_tags: MutableTags,
) -> tuple[CalculatedHashes, CalculatedHashes | None, CalculatedHashes]:
    """
    Compute all grouping hashes for the event in `job`.

    Runs the optional background-grouping experiment, optionally computes
    hashes under the project's expiring secondary (fallback) grouping config,
    computes hashes under the primary grouping config, records comparison and
    count metrics, and combines everything into a single `CalculatedHashes`.

    Returns a 3-tuple of `(primary_hashes, secondary_hashes, all_hashes)`,
    where `secondary_hashes` is `None` if secondary grouping didn't run and
    `all_hashes` merges primary and secondary results (primary values first).

    Side effects: may write a fresh `grouping_config` back into
    `job["data"]`, and sets `job["finest_tree_label"]` when hierarchical
    tree labels are present.
    """
    # Background grouping is a way for us to get performance metrics for a new
    # config without having it actually affect how events are grouped. Its
    # results are discarded; only timing/metrics are kept.
    _maybe_run_background_grouping(project, job)

    secondary_hashes = None

    # Secondary grouping: when a project has recently switched grouping
    # configs, also hash with the old config (until its expiry) so events can
    # still be matched to existing groups.
    if _should_run_secondary_grouping(project):
        with metrics.timer("event_manager.secondary_grouping", tags=metric_tags):
            secondary_grouping_config = SecondaryGroupingConfigLoader().get_config_dict(project)
            secondary_hashes = _calculate_secondary_hash(project, job, secondary_grouping_config)

    with metrics.timer("event_manager.load_grouping_config"):
        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        if is_reprocessed_event(job["data"]):
            # The customer might have changed grouping enhancements since
            # the event was ingested -> make sure we get the fresh one for reprocessing.
            grouping_config = get_grouping_config_dict_for_project(project)
            # Write back grouping config because it might have changed since the
            # event was ingested.
            # NOTE: We could do this unconditionally (regardless of `is_processed`).
            # (`is_processed` here presumably means `is_reprocessed` — confirm.)
            job["data"]["grouping_config"] = grouping_config
        else:
            grouping_config = get_grouping_config_dict_for_event_data(
                job["event"].data.data, project
            )

    with sentry_sdk.start_span(
        op="event_manager",
        description="event_manager.save.calculate_event_grouping",
    ), metrics.timer("event_manager.calculate_event_grouping", tags=metric_tags):
        primary_hashes = _calculate_primary_hash(project, job, grouping_config)

    # Compare primary vs. secondary results so we can measure how much the
    # config change shifts grouping. `secondary_grouping_config` is only bound
    # inside the branch above, but `secondary_hashes` being truthy guarantees
    # that branch ran.
    if secondary_hashes:
        tags = {
            "primary_config": grouping_config["id"],
            "secondary_config": secondary_grouping_config["id"],
        }
        current_values = primary_hashes.hashes
        secondary_values = secondary_hashes.hashes
        hashes_match = current_values == secondary_values

        if hashes_match:
            tags["result"] = "no change"
        else:
            # Any overlap at all counts as a partial change; disjoint hash
            # sets mean grouping changed completely.
            shared_hashes = set(current_values) & set(secondary_values)
            if len(shared_hashes) > 0:
                tags["result"] = "partial change"
            else:
                tags["result"] = "full change"

        metrics.incr("grouping.hash_comparison", tags=tags)

    # Track the total number of grouping calculations done overall, so we can divide by the
    # count to get an average number of calculations per event
    metrics.incr("grouping.hashes_calculated", amount=2 if secondary_hashes else 1)

    # Merge primary and secondary results. Primary hashes come first; the
    # `x and x.attr or []` pattern yields [] when secondary grouping didn't
    # run (secondary_hashes is None) or produced empty values.
    all_hashes = CalculatedHashes(
        hashes=list(primary_hashes.hashes)
        + list(secondary_hashes and secondary_hashes.hashes or []),
        hierarchical_hashes=(
            list(primary_hashes.hierarchical_hashes)
            + list(secondary_hashes and secondary_hashes.hierarchical_hashes or [])
        ),
        tree_labels=(
            primary_hashes.tree_labels or (secondary_hashes and secondary_hashes.tree_labels) or []
        ),
    )

    if all_hashes.tree_labels:
        job["finest_tree_label"] = all_hashes.finest_tree_label

    return (primary_hashes, secondary_hashes, all_hashes)
6 changes: 3 additions & 3 deletions tests/sentry/event_manager/test_event_manager_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_records_num_calculations(self, mock_metrics_incr: MagicMock):
assert hashes_calculated_calls[1].kwargs["amount"] == 2

@mock.patch("sentry.event_manager.metrics.incr")
@mock.patch("sentry.event_manager.should_run_secondary_grouping", return_value=True)
@mock.patch("sentry.grouping.ingest._should_run_secondary_grouping", return_value=True)
def test_records_hash_comparison(self, _, mock_metrics_incr: MagicMock):
project = self.project
project.update_option("sentry:grouping_config", "newstyle:2023-01-11")
Expand All @@ -195,13 +195,13 @@ def test_records_hash_comparison(self, _, mock_metrics_incr: MagicMock):

for primary_hashes, secondary_hashes, expected_tag in cases:
with mock.patch(
"sentry.event_manager.calculate_primary_hash",
"sentry.grouping.ingest._calculate_primary_hash",
return_value=CalculatedHashes(
hashes=primary_hashes, hierarchical_hashes=[], tree_labels=[]
),
):
with mock.patch(
"sentry.event_manager.calculate_secondary_hash",
"sentry.grouping.ingest._calculate_secondary_hash",
return_value=CalculatedHashes(
hashes=secondary_hashes, hierarchical_hashes=[], tree_labels=[]
),
Expand Down
6 changes: 3 additions & 3 deletions tests/sentry/grouping/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sentry.grouping.ingest import (
_calculate_background_grouping,
_calculate_event_grouping,
calculate_secondary_hash,
_calculate_secondary_hash,
)
from sentry.models.group import Group
from sentry.testutils.cases import TestCase
Expand Down Expand Up @@ -134,7 +134,7 @@ def test_applies_secondary_grouping(self):
assert event3.group_id == event2.group_id

@patch("sentry_sdk.capture_exception")
@patch("sentry.event_manager.calculate_secondary_hash", wraps=calculate_secondary_hash)
@patch("sentry.grouping.ingest._calculate_secondary_hash", wraps=_calculate_secondary_hash)
def test_handles_errors_with_secondary_grouping(
self,
mock_calculate_secondary_hash: MagicMock,
Expand All @@ -144,7 +144,7 @@ def test_handles_errors_with_secondary_grouping(
secondary_grouping_config = "legacy:2019-03-12"

def mock_calculate_event_grouping(project, event, grouping_config):
# We only want `_calculate_event_grouping` to error inside of `calculate_secondary_hash`,
# We only want `_calculate_event_grouping` to error inside of `_calculate_secondary_hash`,
# not anywhere else it's called
if grouping_config["id"] == secondary_grouping_config:
raise secondary_grouping_error
Expand Down
2 changes: 1 addition & 1 deletion tests/sentry/tasks/test_reprocessing2.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ def event_preprocessor(data):
original_issue_id = event1.group.id

with mock.patch(
"sentry.event_manager.get_grouping_config_dict_for_project",
"sentry.grouping.ingest.get_grouping_config_dict_for_project",
return_value={
"id": DEFAULT_GROUPING_CONFIG,
"enhancements": Enhancements.from_config_string(
Expand Down

0 comments on commit 1010f72

Please sign in to comment.