Add standard deviation requirement for ptax outliers #22

Merged
merged 8 commits into from
Sep 26, 2023
Changes from 4 commits
49 changes: 46 additions & 3 deletions glue/sales_val_flagging.py
@@ -83,6 +83,37 @@ def add_rolling_window(df, num_months):
return df


def ptax_adjustment(df, groups, ptax_sd):
"""
This function manually applies a ptax adjustment, keeping only
ptax flags that are outside of a certain standard deviation
range in terms of raw price or price per sqft. It creates the
new column and preserves the old ptax column

Inputs:
df: dataframe after flagging has been done
ptax_sd: a list that look like this - [low sd, high sd]
Outputs:
df: ptax adjusted dataframe
"""

group_string = "_".join(groups)
df["ptax_flag_original"] = df["sale_filter_ptax_flag"]

df["sale_filter_ptax_flag"] = df.apply(
Member commented:

suggestion (non-blocking): Rather than overwriting the original column, I would just change the SQL ingest query to return sale_filter_ptax_flag AS ptax_flag_original, then create a new column named ptax_flag_w_deviation (or similar).

Member Author replied:

Good call, should now be implemented.

lambda row: row["sale_filter_ptax_flag"]
and (
(row[f"sv_price_deviation_{group_string}"] >= ptax_sd[1])
or (row[f"sv_price_deviation_{group_string}"] <= -ptax_sd[0])
or (row[f"sv_price_per_sqft_deviation_{group_string}"] >= ptax_sd[1])
or (row[f"sv_price_per_sqft_deviation_{group_string}"] <= -ptax_sd[0])
),
axis=1,
)

return df
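
For context on the thread above, here is a rough sketch of the reviewer's non-blocking suggestion (not what this PR implements as of this commit): alias the flag in the SQL ingest as sale_filter_ptax_flag AS ptax_flag_original, then derive a separate ptax_flag_w_deviation column instead of overwriting sale_filter_ptax_flag. The vectorized form and the ptax_flag_w_deviation name come from the comment; everything else is illustrative.

import pandas as pd


def ptax_adjustment_suggested(df: pd.DataFrame, groups: list, ptax_sd: list) -> pd.DataFrame:
    # Assumes the SQL ingest query already returned
    # sale_filter_ptax_flag AS ptax_flag_original
    group_string = "_".join(groups)
    price_dev = df[f"sv_price_deviation_{group_string}"]
    sqft_dev = df[f"sv_price_per_sqft_deviation_{group_string}"]

    # A ptax flag survives only when either deviation is at or beyond
    # ptax_sd[1] above, or ptax_sd[0] below, the mean
    outside_sd = (
        (price_dev >= ptax_sd[1])
        | (price_dev <= -ptax_sd[0])
        | (sqft_dev >= ptax_sd[1])
        | (sqft_dev <= -ptax_sd[0])
    )
    df["ptax_flag_w_deviation"] = df["ptax_flag_original"] & outside_sd
    return df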


def group_size_adjustment(df, stat_groups: list, min_threshold, condos: bool):
"""
Within the groups of sales we are looking at to flag outliers, some
@@ -156,6 +187,7 @@ def finish_flags(df, start_date, manual_update):
run_id: unique run_id used for metadata, etc.
timestamp: unique timestamp for metadata
"""

# Remove duplicate rows
df = df[df["original_observation"]]
# Discard pre-2014 data
@@ -198,6 +230,7 @@ def finish_flags(df, start_date, manual_update):
"rolling_window",
"sv_is_outlier",
"sv_is_ptax_outlier",
"ptax_flag_original",
"sv_is_heuristic_outlier",
"sv_outlier_type",
]
@@ -333,6 +366,7 @@ def get_parameter_df(
iso_forest_cols,
stat_groups,
dev_bounds,
ptax_sd,
rolling_window,
date_floor,
short_term_thresh,
@@ -364,6 +398,7 @@ def get_parameter_df(
dev_bounds = dev_bounds
date_floor = date_floor
rolling_window = rolling_window
ptax_sd = ptax_sd
min_group_thresh = min_group_thresh

parameter_dict_to_df = {
@@ -375,6 +410,7 @@ def get_parameter_df(
"iso_forest_cols": [iso_forest_cols],
"stat_groups": [stat_groups],
"dev_bounds": [dev_bounds],
"ptax_sd": [ptax_sd],
"rolling_window": [rolling_window],
"date_floor": [date_floor],
"min_group_thresh": [min_group_thresh],
@@ -449,6 +485,7 @@ def write_to_table(df, table_name, s3_warehouse_bucket_path, run_id):
"iso_forest",
"min_groups_threshold",
"dev_bounds",
"ptax_sd",
],
)

@@ -624,6 +661,7 @@ def write_to_table(df, table_name, s3_warehouse_bucket_path, run_id):
iso_forest_list = args["iso_forest"].split(",")
dev_bounds_list = list(map(int, args["dev_bounds"].split(",")))
dev_bounds_tuple = tuple(map(int, args["dev_bounds"].split(",")))
ptax_sd_list = list(map(int, args["ptax_sd"].split(",")))

# Flag Res Outliers
df_res_flagged = go(
@@ -636,7 +674,7 @@ def write_to_table(df, table_name, s3_warehouse_bucket_path, run_id):

df_res_flagged_updated = group_size_adjustment(
df=df_res_flagged,
stat_groups=tuple(stat_groups_list),
stat_groups=stat_groups_list,
min_threshold=int(args["min_groups_threshold"]),
condos=False,
)
@@ -655,7 +693,7 @@ def write_to_table(df, table_name, s3_warehouse_bucket_path, run_id):

df_condo_flagged_updated = group_size_adjustment(
df=df_condo_flagged,
stat_groups=tuple(stat_groups_list),
stat_groups=stat_groups_list,
min_threshold=int(args["min_groups_threshold"]),
condos=True,
)
@@ -664,9 +702,13 @@ def write_to_table(df, table_name, s3_warehouse_bucket_path, run_id):
[df_res_flagged_updated, df_condo_flagged_updated]
).reset_index(drop=True)

df_flagged_ptax = ptax_adjustment(
df=df_flagged_merged, groups=stat_groups_list, ptax_sd=ptax_sd_list
)

# Finish flagging
df_flagged_final, run_id, timestamp = finish_flags(
df=df_flagged_merged,
df=df_flagged_ptax,
start_date=args["time_frame_start"],
manual_update=False,
)
Expand All @@ -693,6 +735,7 @@ def write_to_table(df, table_name, s3_warehouse_bucket_path, run_id):
iso_forest_cols=iso_forest_list,
stat_groups=stat_groups_list,
dev_bounds=dev_bounds_list,
ptax_sd=ptax_sd_list,
rolling_window=int(args["rolling_window_num"]),
date_floor=args["time_frame_start"],
short_term_thresh=SHORT_TERM_OWNER_THRESHOLD,
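For illustration only (not part of the diff): a tiny usage check of the new ptax_adjustment step, assuming glue/sales_val_flagging.py is importable as a module and that flagging already produced deviation columns for a hypothetical stat group of rolling_window and township_code.

import pandas as pd

from glue.sales_val_flagging import ptax_adjustment

groups = ["rolling_window", "township_code"]  # hypothetical stat groups
df = pd.DataFrame(
    {
        "sale_filter_ptax_flag": [True, True, False],
        "sv_price_deviation_rolling_window_township_code": [2.4, 0.3, 1.8],
        "sv_price_per_sqft_deviation_rolling_window_township_code": [1.9, 0.1, 2.2],
    }
)

df = ptax_adjustment(df, groups=groups, ptax_sd=[1, 1])

# With ptax_sd=[1, 1], the first sale keeps its PTAX flag (deviations beyond
# 1 SD), the second loses it (within 1 SD), and the third was never flagged.
print(df[["ptax_flag_original", "sale_filter_ptax_flag"]])
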
7 changes: 6 additions & 1 deletion manual_flagging/initial_flagging.py
@@ -167,9 +167,13 @@
[df_res_flagged_updated, df_condo_flagged_updated]
).reset_index(drop=True)

df_flagged_ptax = flg.ptax_adjustment(
df=df_flagged_merged, groups=inputs["stat_groups"], ptax_sd=inputs["ptax_sd"]
)

# Finish flagging and subset to write to flag table
df_to_write, run_id, timestamp = flg.finish_flags(
df=df_flagged_merged,
df=df_flagged_ptax,
start_date=inputs["time_frame"]["start"],
manual_update=False,
)
@@ -189,6 +193,7 @@
iso_forest_cols=inputs["iso_forest"],
stat_groups=inputs["stat_groups"],
dev_bounds=inputs["dev_bounds"],
ptax_sd=inputs["ptax_sd"],
rolling_window=inputs["rolling_window_months"],
date_floor=inputs["time_frame"]["start"],
short_term_thresh=SHORT_TERM_OWNER_THRESHOLD,
9 changes: 7 additions & 2 deletions manual_flagging/manual_update.py
@@ -177,9 +177,13 @@
[df_res_flagged_updated, df_condo_flagged_updated]
).reset_index(drop=True)

# Finish flagging and subset to write to flag table
df_flagged_ptax = flg.ptax_adjustment(
df=df_flagged_merged, groups=inputs["stat_groups"], ptax_sd=inputs["ptax_sd"]
)

# Finish flagging
df_flagged_final, run_id, timestamp = flg.finish_flags(
df=df_flagged_merged,
df=df_flagged_ptax,
start_date=inputs["time_frame"]["start"],
manual_update=True,
)
@@ -224,6 +228,7 @@
iso_forest_cols=inputs["iso_forest"],
stat_groups=inputs["stat_groups"],
dev_bounds=inputs["dev_bounds"],
ptax_sd=inputs["ptax_sd"],
rolling_window=inputs["rolling_window_months"],
date_floor=inputs["time_frame"]["start"],
short_term_thresh=SHORT_TERM_OWNER_THRESHOLD,
5 changes: 5 additions & 0 deletions manual_flagging/yaml/inputs_initial.yaml
@@ -26,6 +26,11 @@ time_frame:
start: "2019-01-01"
end: "2020-06-30"

# How many total months to include in the grouping methodology
rolling_window_months: 12

# PTAX flags are only kept if the raw or sqft price is at least this
# many standard deviations away from the mean
ptax_sd: [1, 1]

min_groups_threshold: 30
5 changes: 5 additions & 0 deletions manual_flagging/yaml/inputs_update.yaml
@@ -25,6 +25,11 @@ time_frame:
start: "2019-01-01"
end: "2021-12-31"

# How many total months to include in the grouping methodology
rolling_window_months: 12

# PTAX flags are only kept if the raw or sqft price is at least this
# many standard deviations away from the mean
ptax_sd: [1, 1]

min_groups_threshold: 30