OR property filtering API #8012
@@ -8,6 +8,7 @@
     Literal,
     Optional,
     Tuple,
+    Union,
     cast,
 )

@@ -40,10 +41,100 @@
     Property,
     PropertyIdentifier,
     PropertyName,
+    PropertyOperatorType,
 )
 from posthog.models.team import Team
 from posthog.utils import is_valid_regex, relative_date_parse


+# Example:
+# {type: 'AND', properties: [
+#     {type: 'OR', properties: [A, B, C]},
+#     {type: 'OR', properties: [D, E, F]},
+#     G,  # invalid? Can enforce this via UI: if groups, everything else needs to be a group too.
+# ]}
+
+# Example:
+# {type: 'AND', properties: [
+#     A, B, C, D
+# ]}
+
+
+# Property json is of the form:
+# { type: 'AND | OR', properties: List[Property] }
+# which is parsed and sent to this function ->
+class PropertyGroup:
+    type: PropertyOperatorType
+    properties: Union[List[Property], List["PropertyGroup"]]
+
+    def __init__(self, type: PropertyOperatorType, properties: Union[List[Property], List["PropertyGroup"]]) -> None:
+        self.type = type
+        self.properties = properties
+
+
+def parse_prop_grouped_clauses(
+    filter_group: PropertyGroup,
+    prepend: str = "global",
+    table_name: str = "",
+    allow_denormalized_props: bool = True,
+    has_person_id_joined: bool = True,
+    person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY,
+    person_id_joined_alias: str = "person_id",
+    group_properties_joined: bool = True,
+    _top_level: bool = True,
+) -> Tuple[str, Dict]:
+    # TODO: setup structure of new properties, and call parse_prop_clauses with right operator type
+
+    group: Union[Property, PropertyGroup]
+
+    if len(filter_group.properties) == 0:
+        return "", {}
+
+    if isinstance(filter_group.properties[0], PropertyGroup):
+        group_clauses = []
+        final_params = {}
+        group: PropertyGroup
+        for idx, group in enumerate(filter_group.properties):
+            clause, params = parse_prop_grouped_clauses(
+                filter_group=group,
+                prepend=f"{prepend}_{idx}",
+                table_name=table_name,
+                allow_denormalized_props=allow_denormalized_props,
+                has_person_id_joined=has_person_id_joined,
+                person_properties_mode=person_properties_mode,
+                person_id_joined_alias=person_id_joined_alias,
+                group_properties_joined=group_properties_joined,
+                _top_level=False,
+            )
+            group_clauses.append(clause)
+            final_params.update(params)
+
+        _final = f"{filter_group.type} ".join(group_clauses)
+    else:
+        _final, final_params = parse_prop_clauses(
+            filters=filter_group.properties,
+            prepend=f"{prepend}",
+            table_name=table_name,
+            allow_denormalized_props=allow_denormalized_props,
+            has_person_id_joined=has_person_id_joined,
+            person_properties_mode=person_properties_mode,
+            person_id_joined_alias=person_id_joined_alias,
+            group_properties_joined=group_properties_joined,
+            property_operator=filter_group.type,
+        )
+    if _top_level:
+        final = f"AND ({_final})"
+    else:
+        final = f"({_final})"
+
+    return final, final_params
+
+
+def is_property_group(group: Union[Property, "PropertyGroup"]):
+    if isinstance(group, PropertyGroup):

Reviewer comment (on is_property_group): Or even inline?

+        return True
+    else:
+        return False
+
+
 def parse_prop_clauses(
     filters: List[Property],

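For orientation, here is a rough usage sketch of the new grouped API described by the comments above. It is not part of the diff, and the Property constructor arguments are assumed for illustration only.

# Hypothetical usage sketch - Property(...) arguments are assumptions, not taken from this diff.
filter_group = PropertyGroup(
    type="AND",
    properties=[
        PropertyGroup(type="OR", properties=[Property(key="email", value="a@b.com", type="person")]),
        PropertyGroup(type="OR", properties=[Property(key="$browser", value="Chrome", type="event")]),
    ],
)

# Each inner group is rendered recursively with _top_level=False, the sub-clauses are
# joined with the outer group's operator, and only the outermost call prefixes "AND".
clause, params = parse_prop_grouped_clauses(filter_group)
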
@@ -54,6 +145,7 @@ def parse_prop_clauses(
     person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY,
     person_id_joined_alias: str = "person_id",
     group_properties_joined: bool = True,
+    property_operator: PropertyOperatorType = "AND",
 ) -> Tuple[str, Dict]:
     final = []
     params: Dict[str, Any] = {}

@@ -65,13 +157,13 @@ def parse_prop_clauses(
             try:
                 cohort = Cohort.objects.get(pk=prop.value)
             except Cohort.DoesNotExist:
-                final.append("AND 0 = 13")  # If cohort doesn't exist, nothing can match
+                final.append(
+                    f" {property_operator} 0 = 13"
+                )  # If cohort doesn't exist, nothing can match, unless an OR operator is used
             else:
                 person_id_query, cohort_filter_params = format_filter_query(cohort, idx)
                 params = {**params, **cohort_filter_params}
-                final.append(
-                    "AND {table_name}distinct_id IN ({clause})".format(table_name=table_name, clause=person_id_query)
-                )
+                final.append(f" {property_operator} {table_name}distinct_id IN ({person_id_query})")
         elif prop.type == "person" and person_properties_mode != PersonPropertiesMode.EXCLUDE:
             # :TODO: Clean this up by using ClickhousePersonQuery over GET_DISTINCT_IDS_BY_PROPERTY_SQL to have access
             # to materialized columns

@@ -83,17 +175,19 @@
                 "{}person".format(prepend),
                 prop_var="person_props" if is_direct_query else "properties",
                 allow_denormalized_props=allow_denormalized_props and is_direct_query,
+                property_operator="AND",

Reviewer comment: Super tricky. This is a solo condition that's always pushed into a subquery, so it needs to be AND. It's unrelated to the larger logic for the overall condition it's forming (addressed in …).

             )
             if is_direct_query:
                 final.append(filter_query)
                 params.update(filter_params)
             else:
                 final.append(
-                    "AND {table_name}distinct_id IN ({filter_query})".format(
+                    " {property_operator} {table_name}distinct_id IN ({filter_query})".format(
                         filter_query=GET_DISTINCT_IDS_BY_PROPERTY_SQL.format(
                             filters=filter_query, GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS
                         ),
                         table_name=table_name,
+                        property_operator=property_operator,
                     )
                 )
                 params.update(filter_params)

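To make the reviewer's note above concrete, here is a sketch of the clause shape this branch emits, assuming property_operator="OR"; the subquery body is elided rather than reproduced.

# Hypothetical rendering of the outer template from this branch, with made-up values.
outer = " {property_operator} {table_name}distinct_id IN ({filter_query})".format(
    property_operator="OR", table_name="", filter_query="<GET_DISTINCT_IDS_BY_PROPERTY_SQL subquery>",
)
# outer == " OR distinct_id IN (<GET_DISTINCT_IDS_BY_PROPERTY_SQL subquery>)"
# The person condition placed inside that subquery is built with the hard-coded
# property_operator="AND", so it stays a valid standalone WHERE clause no matter
# which operator the surrounding group uses.
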
@@ -102,7 +196,7 @@
                 {prop.key: prop.value}, operator=prop.operator, prepend="{}_".format(idx)
             )
             if query:
-                final.append(f" AND {query}")
+                final.append(f" {property_operator} {query}")
                 params.update(filter_params)
         elif prop.type == "event":
             filter_query, filter_params = prop_filter_json_extract(

@@ -111,8 +205,8 @@
                 prepend,
                 prop_var="{}properties".format(table_name),
                 allow_denormalized_props=allow_denormalized_props,
+                property_operator=property_operator,
             )

             final.append(f" {filter_query}")
             params.update(filter_params)
         elif prop.type == "group":

@@ -123,6 +217,7 @@
                 prepend,
                 prop_var=f"group_properties_{prop.group_type_index}",
                 allow_denormalized_props=False,
+                property_operator=property_operator,
             )
             final.append(filter_query)
             params.update(filter_params)

@@ -135,7 +230,7 @@
             groups_subquery = GET_GROUP_IDS_BY_PROPERTY_SQL.format(
                 filters=filter_query, group_type_index_var=group_type_index_var
             )
-            final.append(f"AND {table_name}$group_{prop.group_type_index} IN ({groups_subquery})")
+            final.append(f" {property_operator} {table_name}$group_{prop.group_type_index} IN ({groups_subquery})")
             params.update(filter_params)
             params[group_type_index_var] = prop.group_type_index
         elif prop.type in ("static-cohort", "precalculated-cohort"):

@@ -144,17 +239,23 @@
             method = format_static_cohort_query if prop.type == "static-cohort" else format_precalculated_cohort_query
             filter_query, filter_params = method(cohort_id, idx, prepend=prepend, custom_match_field=person_id_joined_alias)  # type: ignore
             if has_person_id_joined:
-                final.append(f" AND {filter_query}")
+                final.append(f" {property_operator} {filter_query}")
             else:
                 # :TODO: (performance) Avoid subqueries whenever possible, use joins instead
                 # :TODO: Use get_team_distinct_ids_query instead when possible instead of GET_TEAM_PERSON_DISTINCT_IDS
                 subquery = GET_DISTINCT_IDS_BY_PERSON_ID_FILTER.format(
                     filters=filter_query, GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS
                 )
-                final.append(f"AND {table_name}distinct_id IN ({subquery})")
+                final.append(f" {property_operator} {table_name}distinct_id IN ({subquery})")
             params.update(filter_params)

-    return " ".join(final), params
+    # TODO: clean
+    joined = f"({' '.join(final).replace(property_operator, '', 1)})"
+
+    if final:
+        return joined, params
+
+    return "", params


 def prop_filter_json_extract(

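A quick sketch of the new join logic above (the clause strings are made up): the first occurrence of the operator token is stripped so the combined condition can stand inside parentheses without a dangling leading operator.

# Hypothetical inputs; real fragments come from the branches above.
property_operator = "OR"
final = [" OR has(%(v0)s, x)", " OR distinct_id IN (...)"]

joined = f"({' '.join(final).replace(property_operator, '', 1)})"
# joined == "(  has(%(v0)s, x)  OR distinct_id IN (...))"

Note that str.replace removes the first "OR" wherever it appears in the joined string, which is presumably part of what the "TODO: clean" refers to.
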
@@ -164,6 +265,7 @@ def prop_filter_json_extract(
     prop_var: str = "properties",
     allow_denormalized_props: bool = True,
     transform_expression: Optional[Callable[[str], str]] = None,
+    property_operator: PropertyOperatorType = "AND",
 ) -> Tuple[str, Dict[str, Any]]:
     # TODO: Once all queries are migrated over we can get rid of allow_denormalized_props
     if transform_expression is not None:

@@ -182,66 +284,76 @@ def prop_filter_json_extract(
     if operator == "is_not":
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): box_value(prop.value)}
         return (
-            "AND NOT has(%(v{prepend}_{idx})s, {left})".format(idx=idx, prepend=prepend, left=property_expr),
+            " {property_operator} NOT has(%(v{prepend}_{idx})s, {left})".format(
+                idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator
+            ),
             params,
         )
     elif operator == "icontains":
         value = "%{}%".format(prop.value)
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): value}
         return (
-            "AND {left} ILIKE %(v{prepend}_{idx})s".format(idx=idx, prepend=prepend, left=property_expr),
+            " {property_operator} {left} ILIKE %(v{prepend}_{idx})s".format(
+                idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator
+            ),
             params,
         )
     elif operator == "not_icontains":
         value = "%{}%".format(prop.value)
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): value}
         return (
-            "AND NOT ({left} ILIKE %(v{prepend}_{idx})s)".format(idx=idx, prepend=prepend, left=property_expr),
+            " {property_operator} NOT ({left} ILIKE %(v{prepend}_{idx})s)".format(
+                idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator
+            ),
             params,
         )
     elif operator in ("regex", "not_regex"):
         if not is_valid_regex(prop.value):
-            return "AND 1 = 2", {}
+            # If OR'ing, shouldn't be a problem since nothing will match this specific clause
+            return f" {property_operator} 1 = 2", {}

         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}

         return (
-            "AND {regex_function}({left}, %(v{prepend}_{idx})s)".format(
+            " {property_operator} {regex_function}({left}, %(v{prepend}_{idx})s)".format(
                 regex_function="match" if operator == "regex" else "NOT match",
                 idx=idx,
                 prepend=prepend,
                 left=property_expr,
+                property_operator=property_operator,
             ),
             params,
         )
     elif operator == "is_set":
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
         if is_denormalized:
             return (
-                "AND notEmpty({left})".format(left=property_expr),
+                " {property_operator} notEmpty({left})".format(left=property_expr, property_operator=property_operator),
                 params,
             )
         return (
-            "AND JSONHas({prop_var}, %(k{prepend}_{idx})s)".format(idx=idx, prepend=prepend, prop_var=prop_var),
+            " {property_operator} JSONHas({prop_var}, %(k{prepend}_{idx})s)".format(
+                idx=idx, prepend=prepend, prop_var=prop_var, property_operator=property_operator
+            ),
             params,
         )
     elif operator == "is_not_set":
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
         if is_denormalized:
             return (
-                "AND empty({left})".format(left=property_expr),
+                " {property_operator} empty({left})".format(left=property_expr, property_operator=property_operator),
                 params,
             )
         return (
-            "AND (isNull({left}) OR NOT JSONHas({prop_var}, %(k{prepend}_{idx})s))".format(
-                idx=idx, prepend=prepend, prop_var=prop_var, left=property_expr
+            " {property_operator} (isNull({left}) OR NOT JSONHas({prop_var}, %(k{prepend}_{idx})s))".format(
+                idx=idx, prepend=prepend, prop_var=prop_var, left=property_expr, property_operator=property_operator,
             ),
             params,
         )
     elif operator == "is_date_after":
         # introducing duplication in these branches now rather than refactor too early
         prop_value_param_key = "v{}_{}".format(prepend, idx)
-        query = f"AND parseDateTimeBestEffortOrNull({property_expr}) > %({prop_value_param_key})s"
+        query = f" {property_operator} parseDateTimeBestEffortOrNull({property_expr}) > %({prop_value_param_key})s"

         if (
             prop.property_definition is not None

@@ -251,7 +363,7 @@ def prop_filter_json_extract(
             # ClickHouse can only parse 9 or 10 digit unix timestamps.
             # So we drop the fractional seconds from millisecond timestamps
             # or from after decimal place in seconds timestamps
-            query = f"AND parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) > %({prop_value_param_key})s"
+            query = f" {property_operator} parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) > %({prop_value_param_key})s"

         return (
             query,

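To illustrate the timestamp comment in the hunk above with made-up values: substring(expr, 1, 10) keeps only the leading 10 characters, i.e. the seconds portion of a millisecond epoch value, which ClickHouse can then parse.

# Python equivalent of the ClickHouse substring(expr, 1, 10) truncation; values are hypothetical.
millisecond_ts = "1640995200123"   # 13 digits: seconds plus milliseconds
seconds_ts = millisecond_ts[:10]   # "1640995200" - a parseable 10-digit unix timestamp
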
@@ -260,7 +372,7 @@
     elif operator == "is_date_before":
         # introducing duplication in these branches now rather than refactor too early
         prop_value_param_key = "v{}_{}".format(prepend, idx)
-        query = f"AND parseDateTimeBestEffortOrNull({property_expr}) < %({prop_value_param_key})s"
+        query = f" {property_operator} parseDateTimeBestEffortOrNull({property_expr}) < %({prop_value_param_key})s"

         if (
             prop.property_definition is not None

@@ -270,7 +382,7 @@
             # ClickHouse can only parse 9 or 10 digit unix timestamps.
             # So we drop the fractional seconds from millisecond timestamps
             # or from after decimal place in seconds timestamps
-            query = f"AND parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) < %({prop_value_param_key})s"
+            query = f" {property_operator} parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) < %({prop_value_param_key})s"

         return (
             query,

@@ -279,31 +391,33 @@
     elif operator == "gt":
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
         return (
-            "AND toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) > %(v{prepend}_{idx})s".format(
-                idx=idx, prepend=prepend, left=property_expr,
+            " {property_operator} toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) > %(v{prepend}_{idx})s".format(
+                idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator,
             ),
             params,
         )
     elif operator == "lt":
         params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
         return (
-            "AND toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) < %(v{prepend}_{idx})s".format(
-                idx=idx, prepend=prepend, left=property_expr,
+            " {property_operator} toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) < %(v{prepend}_{idx})s".format(
+                idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator,
            ),
             params,
         )
     else:
         if is_json(prop.value) and not is_denormalized:
-            clause = "AND has(%(v{prepend}_{idx})s, replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s),' ', ''))"
+            clause = " {property_operator} has(%(v{prepend}_{idx})s, replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s),' ', ''))"
             params = {
                 "k{}_{}".format(prepend, idx): prop.key,
                 "v{}_{}".format(prepend, idx): box_value(prop.value, remove_spaces=True),
             }
         else:
-            clause = "AND has(%(v{prepend}_{idx})s, {left})"
+            clause = " {property_operator} has(%(v{prepend}_{idx})s, {left})"
             params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): box_value(prop.value)}
         return (
-            clause.format(left=property_expr, idx=idx, prepend=prepend, prop_var=prop_var),
+            clause.format(
+                left=property_expr, idx=idx, prepend=prepend, prop_var=prop_var, property_operator=property_operator
+            ),
             params,
         )

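Taken together, every branch above now prefixes its fragment with the caller's property_operator rather than a hard-coded AND. A small rendering sketch of the icontains template (the left expression is made up, not the function's real output for any specific property):

# Hypothetical rendering of the icontains template, assuming property_operator="OR".
clause = " {property_operator} {left} ILIKE %(v{prepend}_{idx})s".format(
    property_operator="OR", left="properties_email", prepend="global", idx=0,
)
# clause == " OR properties_email ILIKE %(vglobal_0)s"
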
Reviewer comment: This is a smell in that PropertyGroup just wants to be another kind of Property rather than its own entity. I suggested this in my initial review - not sure why a different approach was chosen, but I strongly disagree with it.
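For context on that objection, one possible shape of the alternative (purely a sketch of the reviewer's suggestion, not code from this PR; it assumes the class would live next to Property in the same module, and all new names are hypothetical):

# Sketch only - not what this PR implements.
class GroupedProperty(Property):
    """A boolean group that is itself a Property, so callers deal with a single type."""

    def __init__(self, type: PropertyOperatorType, properties: List[Union[Property, "GroupedProperty"]]) -> None:
        # Deliberately skips Property.__init__ here; a real version would decide what a
        # group's key/value/operator mean. All names are hypothetical.
        self.type = type
        self.properties = properties

With that shape, parse_prop_clauses could branch on isinstance(prop, GroupedProperty) instead of maintaining a parallel PropertyGroup and parse_prop_grouped_clauses entry point.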