Skip to content

Commit

Permalink
Fix complex ID incorrect aggregation
Browse files Browse the repository at this point in the history
Townhomes receive the mean of their "complex", but the complex ID code
was assigning multi-card properties to the same complex as single-card
ones, pulling up the average. Also, occasionally the "fuzzy grouping"
produced fuzzy-matched "chains" of properties with ever-increasing sqft.
This new addition breaks those chains.
  • Loading branch information
dfsnow committed Mar 1, 2023
1 parent eb52565 commit 98283e3
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
4 changes: 3 additions & 1 deletion params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ input:
"char_bsmt",
"char_gar1_size",
"char_attic_fnsh",
"char_beds"
"char_beds",
"meta_pin_num_cards",
"meta_tieback_proration_rate"
]

# Townhomes should match fuzzily on these variables to be in the same
Expand Down
24 changes: 22 additions & 2 deletions pipeline/00-ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -425,8 +425,9 @@ complex_id_temp <- assessment_data_clean %>%
# Self-join with attributes that must be exactly matching
select(
meta_pin, meta_card_num, meta_township_code, meta_class,
char_bsmt, char_gar1_size, char_attic_fnsh, char_beds,
char_rooms, char_bldg_sf, char_yrblt, loc_x_3435, loc_y_3435
all_of(params$input$complex$match_exact),
any_of(paste0("char_", names(params$input$complex$match_fuzzy))),
loc_x_3435, loc_y_3435
) %>%
full_join(
eval(.),
Expand Down Expand Up @@ -476,6 +477,25 @@ complex_id_data <- assessment_data_clean %>%
meta_complex_id,
lag(meta_complex_id) + 1
)) %>%
# Break long "chains" of fuzzy matched properties into separate groups if the
# chain spans more than the allowed square foot difference
left_join(
assessment_data_clean %>%
filter(meta_class %in% c("210", "295")) %>%
group_by(meta_pin) %>%
summarize(tot_sqft = sum(char_bldg_sf)),
by = "meta_pin"
) %>%
group_by(meta_complex_id) %>%
mutate(
char_break = floor(tot_sqft / params$input$complex$match_fuzzy$bldg_sf),
char_break = char_break - min(char_break),
char_break = floor(char_break / 2)
) %>%
group_by(meta_complex_id, char_break) %>%
mutate(meta_complex_id = cur_group_id()) %>%
ungroup() %>%
select(-c(tot_sqft, char_break)) %>%
write_parquet(paths$input$complex_id$local)


Expand Down

0 comments on commit 98283e3

Please sign in to comment.