From 01287407e6b4791b0cd3f1621dfebec24d651bfc Mon Sep 17 00:00:00 2001 From: William Mak Date: Wed, 3 Nov 2021 16:28:54 -0400 Subject: [PATCH] feat(discover): Add snql support to the events-facets endpoint (#29674) - This also adds support for turbo + sample rate to the QueryBuilder - This updates the tests so feature modifying for snql is easier - Otherwise mostly copy paste from the existing code --- .../endpoints/organization_events_facets.py | 5 +- src/sentry/search/events/builder.py | 9 +- src/sentry/snuba/discover.py | 137 +++++++++++++++++- src/sentry/snuba/events.py | 4 +- tests/sentry/search/events/test_builder.py | 30 ++++ tests/sentry/snuba/test_discover.py | 26 ++-- .../test_organization_events_facets.py | 68 +++++---- 7 files changed, 237 insertions(+), 42 deletions(-) diff --git a/src/sentry/api/endpoints/organization_events_facets.py b/src/sentry/api/endpoints/organization_events_facets.py index d6f77136b23734..8f4e182d1b9796 100644 --- a/src/sentry/api/endpoints/organization_events_facets.py +++ b/src/sentry/api/endpoints/organization_events_facets.py @@ -3,7 +3,7 @@ import sentry_sdk from rest_framework.response import Response -from sentry import tagstore +from sentry import features, tagstore from sentry.api.bases import NoProjects, OrganizationEventsV2EndpointBase from sentry.snuba import discover @@ -24,6 +24,9 @@ def get(self, request, organization): query=request.GET.get("query"), params=params, referrer="api.organization-events-facets.top-tags", + use_snql=features.has( + "organizations:discover-use-snql", organization, actor=request.user + ), ) with sentry_sdk.start_span(op="discover.endpoint", description="populate_results") as span: diff --git a/src/sentry/search/events/builder.py b/src/sentry/search/events/builder.py index 672c38200a1a19..76a82fc9a90fdd 100644 --- a/src/sentry/search/events/builder.py +++ b/src/sentry/search/events/builder.py @@ -4,7 +4,7 @@ from snuba_sdk.column import Column from snuba_sdk.conditions import And, Condition, Op, Or from snuba_sdk.entity import Entity -from snuba_sdk.expressions import Granularity, Limit, Offset +from snuba_sdk.expressions import Granularity, Limit, Offset, Turbo from snuba_sdk.function import CurriedFunction, Function from snuba_sdk.orderby import Direction, LimitBy, OrderBy from snuba_sdk.query import Query @@ -35,6 +35,8 @@ def __init__( limit: Optional[int] = 50, offset: Optional[int] = 0, limitby: Optional[Tuple[str, int]] = None, + turbo: Optional[bool] = False, + sample_rate: Optional[float] = None, ): super().__init__(dataset, params, auto_fields, functions_acl) @@ -44,6 +46,8 @@ def __init__( self.offset = None if offset is None else Offset(offset) self.limitby = self.resolve_limitby(limitby) + self.turbo = Turbo(turbo) + self.sample_rate = sample_rate self.where, self.having = self.resolve_conditions( query, use_aggregate_conditions=use_aggregate_conditions @@ -130,7 +134,7 @@ def get_snql_query(self) -> Query: return Query( dataset=self.dataset.value, - match=Entity(self.dataset.value), + match=Entity(self.dataset.value, sample=self.sample_rate), select=self.columns, array_join=self.array_join, where=self.where, @@ -140,6 +144,7 @@ def get_snql_query(self) -> Query: limit=self.limit, offset=self.offset, limitby=self.limitby, + turbo=self.turbo, ) diff --git a/src/sentry/snuba/discover.py b/src/sentry/snuba/discover.py index 9abc8b1a6bf83c..9ddbc24f8609ef 100644 --- a/src/sentry/snuba/discover.py +++ b/src/sentry/snuba/discover.py @@ -23,6 +23,7 @@ resolve_field_list, ) from sentry.search.events.filter import get_filter +from sentry.search.events.types import ParamsType from sentry.tagstore.base import TOP_VALUES_DEFAULT_LIMIT from sentry.utils.compat import filter from sentry.utils.dates import to_timestamp @@ -73,6 +74,7 @@ resolve_discover_column = resolve_column(Dataset.Discover) OTHER_KEY = "Other" +TOP_KEYS_DEFAULT_LIMIT = 10 def is_real_column(col): @@ -1019,7 +1021,13 @@ def get_id(result): return result[1] -def get_facets(query, params, limit=10, referrer=None): +def get_facets( + query: str, + params: ParamsType, + referrer: str, + limit: Optional[int] = TOP_KEYS_DEFAULT_LIMIT, + use_snql: Optional[bool] = False, +): """ High-level API for getting 'facet map' results. @@ -1029,11 +1037,136 @@ def get_facets(query, params, limit=10, referrer=None): query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment + referrer (str) A referrer string to help locate the origin of this query. limit (int) The number of records to fetch. - referrer (str|None) A referrer string to help locate the origin of this query. Returns Sequence[FacetResult] """ + sentry_sdk.set_tag("discover.use_snql", use_snql) + if use_snql: + # temporarily add snql to referrer + referrer = f"{referrer}.wip-snql" + sample = len(params["project_id"]) > 2 + + with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"): + key_name_builder = QueryBuilder( + Dataset.Discover, + params, + query=query, + selected_columns=["tags_key", "count()"], + orderby=["-count()", "tags_key"], + limit=limit, + turbo=sample, + ) + key_names = raw_snql_query(key_name_builder.get_snql_query(), referrer=referrer) + # Sampling keys for multi-project results as we don't need accuracy + # with that much data. + top_tags = [r["tags_key"] for r in key_names["data"]] + if not top_tags: + return [] + + # TODO: Make the sampling rate scale based on the result size and scaling factor in + # sentry.options. To test the lowest acceptable sampling rate, we use 0.1 which + # is equivalent to turbo. We don't use turbo though as we need to re-scale data, and + # using turbo could cause results to be wrong if the value of turbo is changed in snuba. + sample_rate = 0.1 if (key_names["data"][0]["count"] > 10000) else None + # Rescale the results if we're sampling + multiplier = 1 / sample_rate if sample_rate is not None else 1 + + fetch_projects = False + if len(params.get("project_id", [])) > 1: + if len(top_tags) == limit: + top_tags.pop() + fetch_projects = True + + results = [] + if fetch_projects: + with sentry_sdk.start_span(op="discover.discover", description="facets.projects"): + project_value_builder = QueryBuilder( + Dataset.Discover, + params, + query=query, + selected_columns=["count()", "project_id"], + orderby=["-count()"], + # Ensures Snuba will not apply FINAL + turbo=sample_rate is not None, + sample_rate=sample_rate, + ) + project_values = raw_snql_query( + project_value_builder.get_snql_query(), referrer=referrer + ) + results.extend( + [ + FacetResult("project", r["project_id"], int(r["count"]) * multiplier) + for r in project_values["data"] + ] + ) + + # Get tag counts for our top tags. Fetching them individually + # allows snuba to leverage promoted tags better and enables us to get + # the value count we want. + individual_tags = [] + aggregate_tags = [] + for i, tag in enumerate(top_tags): + if tag == "environment": + # Add here tags that you want to be individual + individual_tags.append(tag) + elif i >= len(top_tags) - TOP_KEYS_DEFAULT_LIMIT: + aggregate_tags.append(tag) + else: + individual_tags.append(tag) + + with sentry_sdk.start_span( + op="discover.discover", description="facets.individual_tags" + ) as span: + span.set_data("tag_count", len(individual_tags)) + for tag_name in individual_tags: + tag = f"tags[{tag_name}]" + tag_value_builder = QueryBuilder( + Dataset.Discover, + params, + query=query, + selected_columns=["count()", tag], + orderby=["-count()"], + limit=TOP_VALUES_DEFAULT_LIMIT, + # Ensures Snuba will not apply FINAL + turbo=sample_rate is not None, + sample_rate=sample_rate, + ) + tag_values = raw_snql_query(tag_value_builder.get_snql_query(), referrer=referrer) + results.extend( + [ + FacetResult(tag_name, r[tag], int(r["count"]) * multiplier) + for r in tag_values["data"] + ] + ) + + if aggregate_tags: + with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"): + aggregate_value_builder = QueryBuilder( + Dataset.Discover, + params, + query=(query if query is not None else "") + + f" tags_key:[{','.join(aggregate_tags)}]", + selected_columns=["count()", "tags_key", "tags_value"], + orderby=["tags_key", "-count()"], + limitby=("tags_key", TOP_VALUES_DEFAULT_LIMIT), + # Ensures Snuba will not apply FINAL + turbo=sample_rate is not None, + sample_rate=sample_rate, + ) + aggregate_values = raw_snql_query( + aggregate_value_builder.get_snql_query(), referrer=referrer + ) + results.extend( + [ + FacetResult(r["tags_key"], r["tags_value"], int(r["count"]) * multiplier) + for r in aggregate_values["data"] + ] + ) + + return results + with sentry_sdk.start_span( op="discover.discover", description="facets.filter_transform" ) as span: diff --git a/src/sentry/snuba/events.py b/src/sentry/snuba/events.py index c7fb0dc040f20d..14e42e60dc3883 100644 --- a/src/sentry/snuba/events.py +++ b/src/sentry/snuba/events.py @@ -32,7 +32,9 @@ class Columns(Enum): TAGS_KEY = Column("events.tags.key", "tags.key", "tags.key", "tags.key", "tags.key") TAGS_VALUE = Column("events.tags.value", "tags.value", "tags.value", "tags.value", "tags.value") TAGS_KEYS = Column("events.tags_key", "tags_key", "tags_key", "tags_key", "tags_key") - TAGS_VALUES = Column("events.tags_value", "tags_value", "tags_value", None, "tags_value") + TAGS_VALUES = Column( + "events.tags_value", "tags_value", "tags_value", "tags_value", "tags_value" + ) TRANSACTION = Column( "events.transaction", "transaction", "transaction_name", "transaction", "transaction" ) diff --git a/tests/sentry/search/events/test_builder.py b/tests/sentry/search/events/test_builder.py index e27c4ee9df1422..9d29437f32326a 100644 --- a/tests/sentry/search/events/test_builder.py +++ b/tests/sentry/search/events/test_builder.py @@ -455,3 +455,33 @@ def test_array_join_clause(self): ) assert query.array_join == Column("spans.op") query.get_snql_query().validate() + + def test_sample_rate(self): + query = QueryBuilder( + Dataset.Discover, + self.params, + "", + selected_columns=[ + "count()", + ], + sample_rate=0.1, + ) + assert query.sample_rate == 0.1 + snql_query = query.get_snql_query() + snql_query.validate() + assert snql_query.match.sample == 0.1 + + def test_turbo(self): + query = QueryBuilder( + Dataset.Discover, + self.params, + "", + selected_columns=[ + "count()", + ], + turbo=True, + ) + assert query.turbo.value + snql_query = query.get_snql_query() + snql_query.validate() + assert snql_query.turbo.value diff --git a/tests/sentry/snuba/test_discover.py b/tests/sentry/snuba/test_discover.py index 9052ba40f59dc0..8824d5e9fb5a4c 100644 --- a/tests/sentry/snuba/test_discover.py +++ b/tests/sentry/snuba/test_discover.py @@ -6212,12 +6212,16 @@ def setUp(self): def test_invalid_query(self): with pytest.raises(InvalidSearchQuery): discover.get_facets( - "\n", {"project_id": [self.project.id], "end": self.min_ago, "start": self.day_ago} + "\n", + {"project_id": [self.project.id], "end": self.min_ago, "start": self.day_ago}, + "testing.get-facets-test", ) def test_no_results(self): results = discover.get_facets( - "", {"project_id": [self.project.id], "end": self.min_ago, "start": self.day_ago} + "", + {"project_id": [self.project.id], "end": self.min_ago, "start": self.day_ago}, + "testing.get-facets-test", ) assert results == [] @@ -6241,7 +6245,7 @@ def test_single_project(self): project_id=self.project.id, ) params = {"project_id": [self.project.id], "start": self.day_ago, "end": self.min_ago} - result = discover.get_facets("", params) + result = discover.get_facets("", params, "testing.get-facets-test") assert len(result) == 5 assert {r.key for r in result} == {"color", "paying", "level"} assert {r.value for r in result} == {"red", "blue", "1", "0", "error"} @@ -6268,7 +6272,7 @@ def test_project_filter(self): project_id=other_project.id, ) params = {"project_id": [self.project.id], "start": self.day_ago, "end": self.min_ago} - result = discover.get_facets("", params) + result = discover.get_facets("", params, "testing.get-facets-test") keys = {r.key for r in result} assert keys == {"color", "level"} @@ -6278,7 +6282,7 @@ def test_project_filter(self): "start": self.day_ago, "end": self.min_ago, } - result = discover.get_facets("", params) + result = discover.get_facets("", params, "testing.get-facets-test") keys = {r.key for r in result} assert keys == {"level", "toy", "color", "project"} @@ -6297,7 +6301,7 @@ def test_environment_promoted_tag(self): project_id=self.project.id, ) params = {"project_id": [self.project.id], "start": self.day_ago, "end": self.min_ago} - result = discover.get_facets("", params) + result = discover.get_facets("", params, "testing.get-facets-test") keys = {r.key for r in result} assert keys == {"environment", "level"} assert {None, "prod", "staging"} == {f.value for f in result if f.key == "environment"} @@ -6323,12 +6327,12 @@ def test_query_string(self): project_id=self.project.id, ) params = {"project_id": [self.project.id], "start": self.day_ago, "end": self.min_ago} - result = discover.get_facets("bad", params) + result = discover.get_facets("bad", params, "testing.get-facets-test") keys = {r.key for r in result} assert "color" in keys assert "toy" not in keys - result = discover.get_facets("color:red", params) + result = discover.get_facets("color:red", params, "testing.get-facets-test") keys = {r.key for r in result} assert "color" in keys assert "toy" not in keys @@ -6353,12 +6357,12 @@ def test_query_string_with_aggregate_condition(self): project_id=self.project.id, ) params = {"project_id": [self.project.id], "start": self.day_ago, "end": self.min_ago} - result = discover.get_facets("bad", params) + result = discover.get_facets("bad", params, "testing.get-facets-test") keys = {r.key for r in result} assert "color" in keys assert "toy" not in keys - result = discover.get_facets("color:red p95():>1", params) + result = discover.get_facets("color:red p95():>1", params, "testing.get-facets-test") keys = {r.key for r in result} assert "color" in keys assert "toy" not in keys @@ -6383,7 +6387,7 @@ def test_date_params(self): project_id=self.project.id, ) params = {"project_id": [self.project.id], "start": self.day_ago, "end": self.min_ago} - result = discover.get_facets("", params) + result = discover.get_facets("", params, "testing.get-facets-test") keys = {r.key for r in result} assert "color" in keys assert "toy" not in keys diff --git a/tests/snuba/api/endpoints/test_organization_events_facets.py b/tests/snuba/api/endpoints/test_organization_events_facets.py index f14ecc0d2cfb01..a290575d4624a4 100644 --- a/tests/snuba/api/endpoints/test_organization_events_facets.py +++ b/tests/snuba/api/endpoints/test_organization_events_facets.py @@ -12,8 +12,6 @@ class OrganizationEventsFacetsEndpointTest(SnubaTestCase, APITestCase): - feature_list = ("organizations:discover-basic", "organizations:global-views") - def setUp(self): super().setUp() self.min_ago = before_now(minutes=1).replace(microsecond=0) @@ -26,6 +24,7 @@ def setUp(self): kwargs={"organization_slug": self.project.organization.slug}, ) self.min_ago_iso = iso_format(self.min_ago) + self.features = {"organizations:discover-basic": True, "organizations:global-views": True} def assert_facet(self, response, key, expected): actual = None @@ -39,9 +38,13 @@ def assert_facet(self, response, key, expected): assert sorted(expected, key=key) == sorted(actual["topValues"], key=key) def test_performance_view_feature(self): - with self.feature( - {"organizations:discover-basic": False, "organizations:performance-view": True} - ): + self.features.update( + { + "organizations:discover-basic": False, + "organizations:performance-view": True, + } + ) + with self.feature(self.features): response = self.client.get(self.url, data={"project": self.project.id}, format="json") assert response.status_code == 200, response.content @@ -72,7 +75,7 @@ def test_simple(self): project_id=self.project.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json") assert response.status_code == 200, response.content @@ -111,7 +114,7 @@ def test_with_message_query(self): project_id=self.project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, {"query": "delet"}, format="json") assert response.status_code == 200, response.content @@ -150,7 +153,7 @@ def test_with_condition(self): project_id=self.project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, {"query": "color:yellow"}, format="json") assert response.status_code == 200, response.content @@ -186,7 +189,7 @@ def test_with_conditional_filter(self): project_id=self.project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get( self.url, {"query": "color:yellow OR color:red"}, format="json" ) @@ -236,7 +239,7 @@ def test_start_end(self): project_id=self.project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get( self.url, {"start": iso_format(self.day_ago), "end": iso_format(self.min_ago)}, @@ -278,7 +281,7 @@ def test_excluded_tag(self): project_id=self.project.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json", data={"project": [self.project.id]}) assert response.status_code == 200, response.content @@ -325,7 +328,7 @@ def test_project_selected(self): project_id=self.project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, {"project": [self.project.id]}, format="json") assert response.status_code == 200, response.content @@ -350,7 +353,7 @@ def test_project_filtered(self): project_id=self.project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get( self.url, {"query": f"project:{self.project.slug}"}, format="json" ) @@ -389,7 +392,7 @@ def test_project_key(self): project_id=self.project.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json") assert response.status_code == 200, response.content @@ -442,7 +445,7 @@ def test_project_key_with_project_tag(self): project_id=private_project2.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json") assert response.status_code == 200, response.content @@ -457,17 +460,17 @@ def test_malformed_query(self): self.store_event(data={"event_id": uuid4().hex}, project_id=self.project.id) self.store_event(data={"event_id": uuid4().hex}, project_id=self.project2.id) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json", data={"query": "\n\n\n\n"}) assert response.status_code == 400, response.content - assert response.data == { - "detail": "Parse error at '\n\n\n\n' (column 1). This is commonly caused by unmatched parentheses. Enclose any text in double quotes." - } + assert response.data["detail"].endswith( + "(column 1). This is commonly caused by unmatched parentheses. Enclose any text in double quotes." + ) @mock.patch("sentry.snuba.discover.raw_query") def test_handling_snuba_errors(self, mock_query): mock_query.side_effect = ParseError("test") - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json") assert response.status_code == 400, response.content @@ -500,7 +503,7 @@ def test_environment(self): project_id=self.project.id, ) - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get(self.url, format="json") assert response.status_code == 200, response.content @@ -511,7 +514,7 @@ def test_environment(self): ] self.assert_facet(response, "environment", expected) - with self.feature(self.feature_list): + with self.feature(self.features): # query by an environment response = self.client.get(self.url, {"environment": "staging"}, format="json") @@ -519,7 +522,7 @@ def test_environment(self): expected = [{"count": 1, "name": "staging", "value": "staging"}] self.assert_facet(response, "environment", expected) - with self.feature(self.feature_list): + with self.feature(self.features): # query by multiple environments response = self.client.get( self.url, {"environment": ["staging", "production"]}, format="json" @@ -533,7 +536,7 @@ def test_environment(self): ] self.assert_facet(response, "environment", expected) - with self.feature(self.feature_list): + with self.feature(self.features): # query by multiple environments, including the "no environment" environment response = self.client.get( self.url, {"environment": ["staging", "production", ""]}, format="json" @@ -548,7 +551,7 @@ def test_environment(self): def test_out_of_retention(self): with self.options({"system.event-retention-days": 10}): - with self.feature(self.feature_list): + with self.feature(self.features): response = self.client.get( self.url, format="json", @@ -591,3 +594,18 @@ def test_quantize_dates(self, mock_quantize): ) assert len(mock_quantize.mock_calls) == 2 + + +class OrganizationEventsFacetsEndpointTestWithSnql(OrganizationEventsFacetsEndpointTest): + def setUp(self): + super().setUp() + self.features["organizations:discover-use-snql"] = True + + # Separate test for now to keep the patching simpler + @mock.patch("sentry.snuba.discover.raw_snql_query") + def test_handling_snuba_errors(self, mock_query): + mock_query.side_effect = ParseError("test") + with self.feature(self.features): + response = self.client.get(self.url, format="json") + + assert response.status_code == 400, response.content