Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Or property filtering API #8012

Merged
merged 30 commits into from
Feb 14, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 146 additions & 32 deletions ee/clickhouse/models/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Literal,
Optional,
Tuple,
Union,
cast,
)

Expand Down Expand Up @@ -40,10 +41,100 @@
Property,
PropertyIdentifier,
PropertyName,
PropertyOperatorType,
)
from posthog.models.team import Team
from posthog.utils import is_valid_regex, relative_date_parse

# Example:
# {type: 'AND', properties: [
# {type: 'OR', properties: [A, B, C]},
# {type: 'OR', properties: [D, E, F]},
# G, # invalid? Can enforce this via UI: if groups, everything else needs to be a group too.
# ]}

# Example:
# {type: 'AND', properties: [
# A, B, C, D
# ]}


# Property json is of the form:
# { type: 'AND | OR', properties: List[Property] }
# which is parsed and sent to this function ->
class PropertyGroup:
type: PropertyOperatorType
properties: Union[List[Property], List["PropertyGroup"]]

def __init__(self, type: PropertyOperatorType, properties: Union[List[Property], List["PropertyGroup"]]) -> None:
self.type = type
self.properties = properties


def parse_prop_grouped_clauses(
filter_group: PropertyGroup,
prepend: str = "global",
table_name: str = "",
allow_denormalized_props: bool = True,
has_person_id_joined: bool = True,
person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY,
person_id_joined_alias: str = "person_id",
group_properties_joined: bool = True,
_top_level: bool = True,
) -> Tuple[str, Dict]:
# TODO: setup structure of new properties, and call parse_prop_clauses with right operator type

group: Union[Property, PropertyGroup]

if len(filter_group.properties) == 0:
return "", {}

if isinstance(filter_group.properties[0], PropertyGroup):
group_clauses = []
final_params = {}
group: PropertyGroup
for idx, group in enumerate(filter_group.properties):
clause, params = parse_prop_grouped_clauses(
filter_group=group,
prepend=f"{prepend}_{idx}",
table_name=table_name,
allow_denormalized_props=allow_denormalized_props,
has_person_id_joined=has_person_id_joined,
person_properties_mode=person_properties_mode,
person_id_joined_alias=person_id_joined_alias,
group_properties_joined=group_properties_joined,
_top_level=False,
)
group_clauses.append(clause)
final_params.update(params)

_final = f"{filter_group.type} ".join(group_clauses)
else:
_final, final_params = parse_prop_clauses(
filters=filter_group.properties,
prepend=f"{prepend}",
table_name=table_name,
allow_denormalized_props=allow_denormalized_props,
has_person_id_joined=has_person_id_joined,
person_properties_mode=person_properties_mode,
person_id_joined_alias=person_id_joined_alias,
group_properties_joined=group_properties_joined,
property_operator=filter_group.type,
)
if _top_level:
final = f"AND ({_final})"
else:
final = f"({_final})"

return final, final_params


def is_property_group(group: Union[Property, "PropertyGroup"]):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a smell in that PropertyGroup just wants to be another kind of Property rather than it's own entity.

I suggested this in my initial review - not sure why a different approach was chosen but I strongly disagree with it.

if isinstance(group, PropertyGroup):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return isinstance(group, PropertyGroup)

Or even inline?

return True
else:
return False


def parse_prop_clauses(
filters: List[Property],
Expand All @@ -54,6 +145,7 @@ def parse_prop_clauses(
person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY,
person_id_joined_alias: str = "person_id",
group_properties_joined: bool = True,
property_operator: PropertyOperatorType = "AND",
) -> Tuple[str, Dict]:
final = []
params: Dict[str, Any] = {}
Expand All @@ -65,13 +157,13 @@ def parse_prop_clauses(
try:
cohort = Cohort.objects.get(pk=prop.value)
except Cohort.DoesNotExist:
final.append("AND 0 = 13") # If cohort doesn't exist, nothing can match
final.append(
f" {property_operator} 0 = 13"
) # If cohort doesn't exist, nothing can match, unless an OR operator is used
else:
person_id_query, cohort_filter_params = format_filter_query(cohort, idx)
params = {**params, **cohort_filter_params}
final.append(
"AND {table_name}distinct_id IN ({clause})".format(table_name=table_name, clause=person_id_query)
)
final.append(f" {property_operator} {table_name}distinct_id IN ({person_id_query})")
elif prop.type == "person" and person_properties_mode != PersonPropertiesMode.EXCLUDE:
# :TODO: Clean this up by using ClickhousePersonQuery over GET_DISTINCT_IDS_BY_PROPERTY_SQL to have access
# to materialized columns
Expand All @@ -83,17 +175,19 @@ def parse_prop_clauses(
"{}person".format(prepend),
prop_var="person_props" if is_direct_query else "properties",
allow_denormalized_props=allow_denormalized_props and is_direct_query,
property_operator="AND",
Copy link
Member Author

@EDsCODE EDsCODE Feb 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super tricky. This is a solo condition that's always pushed into a subquery so it needs to be AND. It's unrelated from the larger logic for the overall condition it's forming (addressed in test_parse_groups_persons)

)
if is_direct_query:
final.append(filter_query)
params.update(filter_params)
else:
final.append(
"AND {table_name}distinct_id IN ({filter_query})".format(
" {property_operator} {table_name}distinct_id IN ({filter_query})".format(
filter_query=GET_DISTINCT_IDS_BY_PROPERTY_SQL.format(
filters=filter_query, GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS
),
table_name=table_name,
property_operator=property_operator,
)
)
params.update(filter_params)
Expand All @@ -102,7 +196,7 @@ def parse_prop_clauses(
{prop.key: prop.value}, operator=prop.operator, prepend="{}_".format(idx)
)
if query:
final.append(f" AND {query}")
final.append(f" {property_operator} {query}")
params.update(filter_params)
elif prop.type == "event":
filter_query, filter_params = prop_filter_json_extract(
Expand All @@ -111,8 +205,8 @@ def parse_prop_clauses(
prepend,
prop_var="{}properties".format(table_name),
allow_denormalized_props=allow_denormalized_props,
property_operator=property_operator,
)

final.append(f" {filter_query}")
params.update(filter_params)
elif prop.type == "group":
Expand All @@ -123,6 +217,7 @@ def parse_prop_clauses(
prepend,
prop_var=f"group_properties_{prop.group_type_index}",
allow_denormalized_props=False,
property_operator=property_operator,
)
final.append(filter_query)
params.update(filter_params)
Expand All @@ -135,7 +230,7 @@ def parse_prop_clauses(
groups_subquery = GET_GROUP_IDS_BY_PROPERTY_SQL.format(
filters=filter_query, group_type_index_var=group_type_index_var
)
final.append(f"AND {table_name}$group_{prop.group_type_index} IN ({groups_subquery})")
final.append(f" {property_operator} {table_name}$group_{prop.group_type_index} IN ({groups_subquery})")
params.update(filter_params)
params[group_type_index_var] = prop.group_type_index
elif prop.type in ("static-cohort", "precalculated-cohort"):
Expand All @@ -144,17 +239,23 @@ def parse_prop_clauses(
method = format_static_cohort_query if prop.type == "static-cohort" else format_precalculated_cohort_query
filter_query, filter_params = method(cohort_id, idx, prepend=prepend, custom_match_field=person_id_joined_alias) # type: ignore
if has_person_id_joined:
final.append(f" AND {filter_query}")
final.append(f" {property_operator} {filter_query}")
else:
# :TODO: (performance) Avoid subqueries whenever possible, use joins instead
# :TODO: Use get_team_distinct_ids_query instead when possible instead of GET_TEAM_PERSON_DISTINCT_IDS
subquery = GET_DISTINCT_IDS_BY_PERSON_ID_FILTER.format(
filters=filter_query, GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS
)
final.append(f"AND {table_name}distinct_id IN ({subquery})")
final.append(f" {property_operator} {table_name}distinct_id IN ({subquery})")
params.update(filter_params)

return " ".join(final), params
# TODO: clean
joined = f"({' '.join(final).replace(property_operator, '', 1)})"

if final:
return joined, params

return "", params


def prop_filter_json_extract(
Expand All @@ -164,6 +265,7 @@ def prop_filter_json_extract(
prop_var: str = "properties",
allow_denormalized_props: bool = True,
transform_expression: Optional[Callable[[str], str]] = None,
property_operator: PropertyOperatorType = "AND",
) -> Tuple[str, Dict[str, Any]]:
# TODO: Once all queries are migrated over we can get rid of allow_denormalized_props
if transform_expression is not None:
Expand All @@ -182,66 +284,76 @@ def prop_filter_json_extract(
if operator == "is_not":
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): box_value(prop.value)}
return (
"AND NOT has(%(v{prepend}_{idx})s, {left})".format(idx=idx, prepend=prepend, left=property_expr),
" {property_operator} NOT has(%(v{prepend}_{idx})s, {left})".format(
idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator
),
params,
)
elif operator == "icontains":
value = "%{}%".format(prop.value)
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): value}
return (
"AND {left} ILIKE %(v{prepend}_{idx})s".format(idx=idx, prepend=prepend, left=property_expr),
" {property_operator} {left} ILIKE %(v{prepend}_{idx})s".format(
idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator
),
params,
)
elif operator == "not_icontains":
value = "%{}%".format(prop.value)
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): value}
return (
"AND NOT ({left} ILIKE %(v{prepend}_{idx})s)".format(idx=idx, prepend=prepend, left=property_expr),
" {property_operator} NOT ({left} ILIKE %(v{prepend}_{idx})s)".format(
idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator
),
params,
)
elif operator in ("regex", "not_regex"):
if not is_valid_regex(prop.value):
return "AND 1 = 2", {}
# If OR'ing, shouldn't be a problem since nothing will match this specific clause
return f" {property_operator} 1 = 2", {}

params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}

return (
"AND {regex_function}({left}, %(v{prepend}_{idx})s)".format(
" {property_operator} {regex_function}({left}, %(v{prepend}_{idx})s)".format(
regex_function="match" if operator == "regex" else "NOT match",
idx=idx,
prepend=prepend,
left=property_expr,
property_operator=property_operator,
),
params,
)
elif operator == "is_set":
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
if is_denormalized:
return (
"AND notEmpty({left})".format(left=property_expr),
" {property_operator} notEmpty({left})".format(left=property_expr, property_operator=property_operator),
params,
)
return (
"AND JSONHas({prop_var}, %(k{prepend}_{idx})s)".format(idx=idx, prepend=prepend, prop_var=prop_var),
" {property_operator} JSONHas({prop_var}, %(k{prepend}_{idx})s)".format(
idx=idx, prepend=prepend, prop_var=prop_var, property_operator=property_operator
),
params,
)
elif operator == "is_not_set":
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
if is_denormalized:
return (
"AND empty({left})".format(left=property_expr),
" {property_operator} empty({left})".format(left=property_expr, property_operator=property_operator),
params,
)
return (
"AND (isNull({left}) OR NOT JSONHas({prop_var}, %(k{prepend}_{idx})s))".format(
idx=idx, prepend=prepend, prop_var=prop_var, left=property_expr
" {property_operator} (isNull({left}) OR NOT JSONHas({prop_var}, %(k{prepend}_{idx})s))".format(
idx=idx, prepend=prepend, prop_var=prop_var, left=property_expr, property_operator=property_operator,
),
params,
)
elif operator == "is_date_after":
# introducing duplication in these branches now rather than refactor too early
prop_value_param_key = "v{}_{}".format(prepend, idx)
query = f"AND parseDateTimeBestEffortOrNull({property_expr}) > %({prop_value_param_key})s"
query = f" {property_operator} parseDateTimeBestEffortOrNull({property_expr}) > %({prop_value_param_key})s"

if (
prop.property_definition is not None
Expand All @@ -251,7 +363,7 @@ def prop_filter_json_extract(
# ClickHouse can only parse 9 or 10 digit unix timestamps.
# So we drop the fractional seconds from millisecond timestamps
# or from after decimal place in seconds timestamps
query = f"AND parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) > %({prop_value_param_key})s"
query = f" {property_operator} parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) > %({prop_value_param_key})s"

return (
query,
Expand All @@ -260,7 +372,7 @@ def prop_filter_json_extract(
elif operator == "is_date_before":
# introducing duplication in these branches now rather than refactor too early
prop_value_param_key = "v{}_{}".format(prepend, idx)
query = f"AND parseDateTimeBestEffortOrNull({property_expr}) < %({prop_value_param_key})s"
query = f" {property_operator} parseDateTimeBestEffortOrNull({property_expr}) < %({prop_value_param_key})s"

if (
prop.property_definition is not None
Expand All @@ -270,7 +382,7 @@ def prop_filter_json_extract(
# ClickHouse can only parse 9 or 10 digit unix timestamps.
# So we drop the fractional seconds from millisecond timestamps
# or from after decimal place in seconds timestamps
query = f"AND parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) < %({prop_value_param_key})s"
query = f" {property_operator} parseDateTimeBestEffortOrNull(substring({property_expr}, 1, 10)) < %({prop_value_param_key})s"

return (
query,
Expand All @@ -279,31 +391,33 @@ def prop_filter_json_extract(
elif operator == "gt":
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
return (
"AND toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) > %(v{prepend}_{idx})s".format(
idx=idx, prepend=prepend, left=property_expr,
" {property_operator} toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) > %(v{prepend}_{idx})s".format(
idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator,
),
params,
)
elif operator == "lt":
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value}
return (
"AND toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) < %(v{prepend}_{idx})s".format(
idx=idx, prepend=prepend, left=property_expr,
" {property_operator} toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) < %(v{prepend}_{idx})s".format(
idx=idx, prepend=prepend, left=property_expr, property_operator=property_operator,
),
params,
)
else:
if is_json(prop.value) and not is_denormalized:
clause = "AND has(%(v{prepend}_{idx})s, replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s),' ', ''))"
clause = " {property_operator} has(%(v{prepend}_{idx})s, replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s),' ', ''))"
params = {
"k{}_{}".format(prepend, idx): prop.key,
"v{}_{}".format(prepend, idx): box_value(prop.value, remove_spaces=True),
}
else:
clause = "AND has(%(v{prepend}_{idx})s, {left})"
clause = " {property_operator} has(%(v{prepend}_{idx})s, {left})"
params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): box_value(prop.value)}
return (
clause.format(left=property_expr, idx=idx, prepend=prepend, prop_var=prop_var),
clause.format(
left=property_expr, idx=idx, prepend=prepend, prop_var=prop_var, property_operator=property_operator
),
params,
)

Expand Down
Loading