Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Cache the hogql parsing part of setting up the channel type fields #27768

Merged
merged 4 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions posthog/hogql/database/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,14 +328,14 @@ def create_hogql_database(
)
cast(LazyJoin, raw_replay_events.fields["events"]).join_table = events

with timings.measure("initial_domain_type"):
database.persons.fields["$virt_initial_referring_domain_type"] = create_initial_domain_type(
"$virt_initial_referring_domain_type"
)
with timings.measure("initial_channel_type"):
database.persons.fields["$virt_initial_channel_type"] = create_initial_channel_type(
"$virt_initial_channel_type", modifiers.customChannelTypeRules
)
with timings.measure("initial_domain_type"):
database.persons.fields["$virt_initial_referring_domain_type"] = create_initial_domain_type(
"$virt_initial_referring_domain_type", timings=timings
)
with timings.measure("initial_channel_type"):
database.persons.fields["$virt_initial_channel_type"] = create_initial_channel_type(
"$virt_initial_channel_type", modifiers.customChannelTypeRules, timings=timings
)

with timings.measure("group_type_mapping"):
for mapping in GroupTypeMapping.objects.filter(project_id=team.project_id):
Expand Down
233 changes: 134 additions & 99 deletions posthog/hogql/database/schema/channel_type.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from dataclasses import dataclass
from functools import cache
from typing import Optional, Union


from posthog.hogql import ast
from posthog.hogql.database.models import ExpressionField
from posthog.hogql.parser import parse_expr
from posthog.hogql.placeholders import replace_placeholders
from posthog.hogql.timings import HogQLTimings
from posthog.schema import (
CustomChannelRule,
CustomChannelOperator,
Expand Down Expand Up @@ -41,17 +45,17 @@ class ChannelTypeExprs:
gad_source: ast.Expr


def create_initial_domain_type(name: str):
def create_initial_domain_type(name: str, timings: Optional[HogQLTimings] = None):
if timings is None:
timings = HogQLTimings()

with timings.measure("initial_domain_type_expr"):
expr = _initial_domain_type_expr()

return ExpressionField(
name=name,
expr=parse_expr(
"""
if(
{referring_domain} = '$direct',
'$direct',
hogql_lookupDomainType({referring_domain})
)
""",
expr=replace_placeholders(
expr,
{
"referring_domain": ast.Call(
name="toString", args=[ast.Field(chain=["properties", "$initial_referring_domain"])]
Expand All @@ -61,7 +65,22 @@ def create_initial_domain_type(name: str):
)


def create_initial_channel_type(name: str, custom_rules: Optional[list[CustomChannelRule]] = None):
@cache
def _initial_domain_type_expr():
return parse_expr(
"""
if(
{referring_domain} = '$direct',
'$direct',
hogql_lookupDomainType({referring_domain})
)
"""
)


def create_initial_channel_type(
name: str, custom_rules: Optional[list[CustomChannelRule]] = None, timings: Optional[HogQLTimings] = None
):
return ExpressionField(
name=name,
expr=create_channel_type_expr(
Expand Down Expand Up @@ -89,6 +108,7 @@ def create_initial_channel_type(name: str, custom_rules: Optional[list[CustomCha
gad_source=ast.Call(name="toString", args=[ast.Field(chain=["properties", "$initial_gad_source"])]),
),
custom_rules=custom_rules,
timings=timings,
),
)

Expand Down Expand Up @@ -203,109 +223,124 @@ def custom_rule_to_expr(custom_rule: CustomChannelRule, source_exprs: ChannelTyp


def create_channel_type_expr(
custom_rules: Optional[list[CustomChannelRule]], source_exprs: ChannelTypeExprs
custom_rules: Optional[list[CustomChannelRule]],
source_exprs: ChannelTypeExprs,
timings: Optional[HogQLTimings] = None,
) -> ast.Expr:
custom_rule_expr: Optional[ast.Expr] = None
if custom_rules:
if_args = []
for rule in custom_rules:
if_args.append(custom_rule_to_expr(rule, source_exprs))
if_args.append(ast.Constant(value=rule.channel_type))
if_args.append(ast.Constant(value=None))
custom_rule_expr = ast.Call(name="multiIf", args=if_args)
if timings is None:
timings = HogQLTimings()

with timings.measure("custom_channel_rules"):
custom_rule_expr: Optional[ast.Expr] = None
if custom_rules:
if_args = []
for rule in custom_rules:
if_args.append(custom_rule_to_expr(rule, source_exprs))
if_args.append(ast.Constant(value=rule.channel_type))
if_args.append(ast.Constant(value=None))
custom_rule_expr = ast.Call(name="multiIf", args=if_args)

with timings.measure("default_channel_rules_parse"):
builtin_rules_expr = _initial_default_channel_rules_expr()
with timings.measure("default_channel_rules_replace"):
builtin_rules = replace_placeholders(
builtin_rules_expr,
placeholders={
"campaign": wrap_with_lower(wrap_with_null_if_empty(source_exprs.campaign)),
"medium": wrap_with_lower(wrap_with_null_if_empty(source_exprs.medium)),
"source": wrap_with_lower(wrap_with_null_if_empty(source_exprs.source)),
"referring_domain": source_exprs.referring_domain,
"has_gclid": source_exprs.has_gclid,
"has_fbclid": source_exprs.has_fbclid,
"gad_source": wrap_with_null_if_empty(source_exprs.gad_source),
},
)
if custom_rule_expr:
return ast.Call(
name="coalesce",
args=[custom_rule_expr, builtin_rules],
)
else:
return builtin_rules


@cache
def _initial_default_channel_rules_expr():
# This logic is referenced in our docs https://posthog.com/docs/data/channel-type, be sure to update both if you
# update either.
builtin_rules = parse_expr(
return parse_expr(
"""
multiIf(
match({campaign}, 'cross-network'),
'Cross Network',

(
{medium} IN ('cpc', 'cpm', 'cpv', 'cpa', 'ppc', 'retargeting') OR
startsWith({medium}, 'paid') OR
{has_gclid} OR
{gad_source} IS NOT NULL
),
coalesce(
hogql_lookupPaidSourceType({source}),
if(
match({campaign}, '^(.*(([^a-df-z]|^)shop|shopping).*)$'),
'Paid Shopping',
NULL
),
hogql_lookupPaidMediumType({medium}),
hogql_lookupPaidSourceType({referring_domain}),
multiIf (
{gad_source} = '1',
'Paid Search',
multiIf(
match({campaign}, 'cross-network'),
'Cross Network',

(
{medium} IN ('cpc', 'cpm', 'cpv', 'cpa', 'ppc', 'retargeting') OR
startsWith({medium}, 'paid') OR
{has_gclid} OR
{gad_source} IS NOT NULL
),
coalesce(
hogql_lookupPaidSourceType({source}),
if(
match({campaign}, '^(.*(([^a-df-z]|^)shop|shopping).*)$'),
'Paid Shopping',
NULL
),
hogql_lookupPaidMediumType({medium}),
hogql_lookupPaidSourceType({referring_domain}),
multiIf (
{gad_source} = '1',
'Paid Search',

match({campaign}, '^(.*video.*)$'),
'Paid Video',
match({campaign}, '^(.*video.*)$'),
'Paid Video',

{has_fbclid},
'Paid Social',
{has_fbclid},
'Paid Social',

'Paid Unknown'
)
),

(
{referring_domain} = '$direct'
AND ({medium} IS NULL)
AND ({source} IS NULL OR {source} IN ('(direct)', 'direct', '$direct'))
AND NOT {has_fbclid}
),
'Direct',

coalesce(
hogql_lookupOrganicSourceType({source}),
if(
match({campaign}, '^(.*(([^a-df-z]|^)shop|shopping).*)$'),
'Organic Shopping',
NULL
),
hogql_lookupOrganicMediumType({medium}),
hogql_lookupOrganicSourceType({referring_domain}),
multiIf(
match({campaign}, '^(.*video.*)$'),
'Organic Video',
'Paid Unknown'
)
),

match({medium}, 'push$'),
'Push',
(
{referring_domain} = '$direct'
AND ({medium} IS NULL)
AND ({source} IS NULL OR {source} IN ('(direct)', 'direct', '$direct'))
AND NOT {has_fbclid}
),
'Direct',

{has_fbclid},
'Organic Social',
coalesce(
hogql_lookupOrganicSourceType({source}),
if(
match({campaign}, '^(.*(([^a-df-z]|^)shop|shopping).*)$'),
'Organic Shopping',
NULL
),
hogql_lookupOrganicMediumType({medium}),
hogql_lookupOrganicSourceType({referring_domain}),
multiIf(
match({campaign}, '^(.*video.*)$'),
'Organic Video',

{referring_domain} == '$direct',
'Direct',
match({medium}, 'push$'),
'Push',

{referring_domain} IS NOT NULL,
'Referral',
{has_fbclid},
'Organic Social',

'Unknown'
)
)
)""",
start=None,
placeholders={
"campaign": wrap_with_lower(wrap_with_null_if_empty(source_exprs.campaign)),
"medium": wrap_with_lower(wrap_with_null_if_empty(source_exprs.medium)),
"source": wrap_with_lower(wrap_with_null_if_empty(source_exprs.source)),
"referring_domain": source_exprs.referring_domain,
"has_gclid": source_exprs.has_gclid,
"has_fbclid": source_exprs.has_fbclid,
"gad_source": wrap_with_null_if_empty(source_exprs.gad_source),
},
{referring_domain} == '$direct',
'Direct',

{referring_domain} IS NOT NULL,
'Referral',

'Unknown'
)
)
)"""
)
if custom_rule_expr:
return ast.Call(
name="coalesce",
args=[custom_rule_expr, builtin_rules],
)
else:
return builtin_rules


def wrap_with_null_if_empty(expr: ast.Expr) -> ast.Expr:
Expand Down
1 change: 1 addition & 0 deletions posthog/hogql/database/schema/sessions_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ def arg_max_merge_field(field_name: str) -> ast.Call:
),
gad_source=aggregate_fields["$entry_gad_source"],
),
timings=context.timings,
)

# aliases for people reverting from v2 to v1
Expand Down
1 change: 1 addition & 0 deletions posthog/hogql/database/schema/sessions_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ def arg_max_merge_field(field_name: str) -> ast.Call:
),
gad_source=aggregate_fields["$entry_gad_source"],
),
timings=context.timings,
)
# some aliases for people upgrading from v1 to v2
aggregate_fields["$exit_current_url"] = aggregate_fields["$end_current_url"]
Expand Down
Loading