Commit
Update data / parameters for 2025 model and implement forward filling (#330)

* Update params

* Remove from setup

* Update dvc

* Include forward filling

* mutate to character

* Move process arrays up

* Tidy up

* Remove duplicated code

* Push dvc changes

* Update forward filling

* Improve forward filling to work with training_data

* Remove duplicate loc_ processing

* lintr

* lintr

* styler

* Push dvc

* Revert dvc.yaml

* Update input data with forward fill

* Bump ccao and assessr packages

* Add data.table forward filling

* Update input data

* Style ingest script

---------

Co-authored-by: Dan Snow <daniel.snow@cookcountyil.gov>
Damonamajor and dfsnow authored Jan 23, 2025
1 parent c46937c commit f823872
Showing 5 changed files with 144 additions and 99 deletions.
14 changes: 7 additions & 7 deletions analyses/new-feature-template.qmd
@@ -15,13 +15,13 @@ format:
fig-align: center
fontsize: 12pt
params:
run_id: "2024-07-03-charming-boni"
run_id_year: "2024"
comparison_run_id: "2024-07-13-great-eric"
comparison_run_id_year: "2024"
added_feature: "prox_nearest_new_construction_dist_ft"
added_feature_shap: "prox_nearest_new_construction_dist_ft_shap"
description: "A distance in feet to the nearest new construction"
run_id: "2025-01-13-dazzling-kyra"
run_id_year: "2025"
comparison_run_id: "2025-01-10-serene-boni"
comparison_run_id_year: "2025"
added_feature: "time_sale_roll_mean_nbhd_t0_w3"
added_feature_shap: "time_sale_roll_mean_nbhd_t0_w3_shap"
description: "Added feature to calculate neighborhood rolling averages of sale price"
min_range: 5
max_range: 95
type: "continuous"
38 changes: 19 additions & 19 deletions dvc.lock
@@ -5,20 +5,20 @@ stages:
deps:
- path: pipeline/00-ingest.R
hash: md5
md5: 3fcf8cbce89340948f394825b1c78187
size: 23441
md5: 6aa03fd110dcc51b509585c22c7d7ff7
size: 24625
params:
params.yaml:
assessment:
year: '2024'
date: '2024-01-01'
year: '2025'
date: '2025-01-01'
triad: north
group: residential
data_year: '2023'
data_year: '2024'
working_year: '2025'
input:
min_sale_year: '2015'
max_sale_year: '2023'
min_sale_year: '2016'
max_sale_year: '2024'
n_years_prior: 4
complex:
match_exact:
@@ -38,28 +38,28 @@
outs:
- path: input/assessment_data.parquet
hash: md5
md5: cecaf4aee89d2269bd059f536e611101
size: 425453415
md5: ba6f103742f30bba7da6759728fd8bb1
size: 415029836
- path: input/char_data.parquet
hash: md5
md5: 121f3017e46d8acd1f19ba689dca9726
size: 848066404
md5: f783f4629433bf1b5e8f1ef3033cba0a
size: 840256376
- path: input/complex_id_data.parquet
hash: md5
md5: 835be789fc9ef09f3bfa1d5c8465f6e6
size: 704175
md5: 892c2a64cd5f8f35f9a4c9608fa1464e
size: 701036
- path: input/hie_data.parquet
hash: md5
md5: ec600ed3e19b5b48059ce1d270b114b5
size: 1911086
md5: fcaa54bf22240ed6fcacd2b7b6a34941
size: 1924013
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: f3ec9627322bd271bf2957b7388aaa34
size: 3873
md5: 5fe80edfabdfac91efe888a25ee4051c
size: 6019
- path: input/training_data.parquet
hash: md5
md5: 3156fd30394ae3fb9eda7e0d0176ab2f
size: 208501951
md5: 18addc910a0ccbf7bb5b2fd9c7923f2d
size: 204978802
train:
cmd: Rscript pipeline/01-train.R
deps:
18 changes: 9 additions & 9 deletions params.yaml
@@ -15,7 +15,7 @@ run_type: "test"

# Note included with each run. Use this to summarize what changed about the run
# or add context
run_note: Preparing for 2025 model with 2024 data
run_note: Preparing for 2025 model with 2024 data and updated sales

toggle:
# Should the train stage run full cross-validation? Otherwise, the model
@@ -39,10 +39,10 @@ toggle:
# Assessment context and dates
assessment:
# Year of assessment. Used to pull land rates, HIEs, and other information
year: "2024"
year: "2025"

# The statutorily set "sale date" for the purpose of prediction
date: "2024-01-01"
date: "2025-01-01"

# Added context for model artifacts stored in S3. Also updates the triad
# displayed in email notifications on model completion
@@ -51,7 +51,7 @@ assessment:

# Year from which property characteristics are pulled. Usually lags the
# assessment year by 1
data_year: "2023"
data_year: "2024"

# Year used to partition data on S3. Working year in this case means
# the year the Data Department is currently creating models for
@@ -60,8 +60,8 @@ assessment:
# Parameters used to define the input/training data
input:
# The min and max year of sales to use for the training data sample
min_sale_year: "2015"
max_sale_year: "2023"
min_sale_year: "2016"
max_sale_year: "2024"

# Number of years back to look for count_past_n_years feature
n_years_prior: 4
@@ -138,7 +138,7 @@ model:

# Parameters related to model determinism. Current settings should force
# the same output every time if the same hyperparameters are used
seed: 2024
seed: 2025
deterministic: true
force_row_wise: true

@@ -386,10 +386,10 @@ pv:
# (post-appeal) from the last reassessment and the most recent values from the
# prior year
ratio_study:
far_year: "2021"
far_year: "2022"
far_stage: "board"
far_column: "meta_2yr_pri_board_tot"
near_year: "2023"
near_year: "2024"
near_stage: "certified"
near_column: "meta_certified_tot"

159 changes: 102 additions & 57 deletions pipeline/00-ingest.R
@@ -32,7 +32,52 @@ AWS_ATHENA_CONN_NOCTUA <- dbConnect(


#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Pull Data -----------------------------------------------------------------
# 2. Define Functions ----------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Ingest-specific helper functions for data cleaning, etc.

# Create a dictionary of column types, as specified in ccao::vars_dict
col_type_dict <- ccao::vars_dict %>%
distinct(var_name = var_name_model, var_type = var_data_type) %>%
drop_na(var_name)

# Mini-function to ensure that columns are the correct type
recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
col_type <- dictionary %>%
filter(var_name == col_name) %>%
pull(var_type)
switch(col_type,
numeric = as.numeric(col),
character = as.character(col),
logical = as.logical(as.numeric(col)),
categorical = as.factor(col),
date = lubridate::as_date(col)
)
}
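
# Illustrative usage (a sketch, not run by the pipeline): the helper above
# can be applied across an entire tibble with dplyr, where `df` stands in
# for a hypothetical data frame whose columns all appear in col_type_dict:
#
#   df_typed <- df %>%
#     mutate(across(everything(), ~ recode_column_type(.x, cur_column())))
#
# cur_column() supplies each column's name, so the switch() above picks the
# matching coercion from the dictionary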

# Mini-function to deal with arrays
# Some Athena columns are stored as arrays but are converted to string on
# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
# collapse the array into a comma-separated string (if more than 1 unit),
# or replace with NA if the array is empty
process_array_column <- function(x) {
purrr::map_chr(x, function(cell) {
if (length(cell) > 1) {
paste(cell, collapse = ", ")
} else if (length(cell) == 1) {
as.character(cell) # Convert the single element to character
} else {
NA # Handle cases where the array is empty
}
})
}
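
# Illustrative behavior (a sketch with toy input, not run by the pipeline):
#
#   process_array_column(list(c("a", "b"), "c", character(0)))
#   #> [1] "a, b" "c"    NA
#
# A two-element array collapses to a comma-separated string, a single
# element is returned as character, and an empty array becomes NA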




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 3. Pull Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Pulling data from Athena")

@@ -68,7 +113,6 @@ training_data <- dbGetQuery(
AND NOT sale.sale_filter_same_sale_within_365
AND NOT sale.sale_filter_less_than_10k
AND NOT sale.sale_filter_deed_type
AND Year(sale.sale_date) >= {params$input$min_sale_year}
")
)
tictoc::toc()
@@ -102,6 +146,60 @@ assessment_data <- dbGetQuery(
)
tictoc::toc()




##### START TEMPORARY FIX FOR MISSING DATA. REMOVE ONCE 2024 DATA IS AVAILABLE
library(data.table)
conflict_prefer_all("dplyr", "data.table", quiet = TRUE)
conflict_prefer_all("lubridate", "data.table", quiet = TRUE)
fill_cols <- assessment_data %>%
select(
starts_with("loc_"),
starts_with("prox_"),
starts_with("acs5_"),
starts_with("other_"),
starts_with("shp_")
) %>%
names()
assessment_data_temp <- as.data.table(assessment_data) %>%
mutate(across(starts_with("loc_tax_"), process_array_column))
assessment_data_temp_2024 <- assessment_data_temp[
meta_year == "2024",
][
assessment_data_temp[meta_year == "2023"],
(fill_cols) := mget(paste0("i.", fill_cols)),
on = .(meta_pin, meta_card_num)
]
assessment_data <- rbind(
assessment_data_temp[meta_year != "2024"],
assessment_data_temp_2024
) %>%
as_tibble()

training_data_temp <- as.data.table(training_data) %>%
mutate(across(starts_with("loc_tax_"), process_array_column))
training_data_temp_2024 <- training_data_temp[
meta_year == "2024",
][
assessment_data_temp[meta_year == "2023"],
(fill_cols) := mget(paste0("i.", fill_cols)),
on = .(meta_pin, meta_card_num)
]
training_data <- rbind(
training_data_temp[meta_year != "2024"],
training_data_temp_2024
) %>%
as_tibble()
rm(
assessment_data_temp, assessment_data_temp_2024,
training_data_temp, training_data_temp_2024
)
##### END TEMPORARY FIX
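
# The fix above relies on data.table's update join:
# X[Y, (cols) := mget(paste0("i.", cols)), on = .(keys)] overwrites X's
# columns in place with values from the joined table Y (the "i." prefix
# refers to Y's columns). A minimal sketch of the pattern, using
# hypothetical toy tables:
#
#   library(data.table)
#   x <- data.table(pin = c("A", "B"), val = c(NA_real_, NA_real_))
#   y <- data.table(pin = c("A", "B"), val = c(10, 20))
#   x[y, val := i.val, on = .(pin)]
#   x$val # now c(10, 20)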




# Save both years for report generation using the characteristics
assessment_data %>%
write_parquet(paths$input$char$local)
@@ -128,58 +226,6 @@ rm(AWS_ATHENA_CONN_NOCTUA)



#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 3. Define Functions ----------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Ingest-specific helper functions for data cleaning, etc.

# Create a dictionary of column types, as specified in ccao::vars_dict
col_type_dict <- ccao::vars_dict %>%
distinct(var_name = var_name_model, var_type = var_data_type) %>%
drop_na(var_name)

# Mini-function to ensure that columns are the correct type
recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
col_type <- dictionary %>%
filter(var_name == col_name) %>%
pull(var_type)
switch(col_type,
numeric = as.numeric(col),
character = as.character(col),
logical = as.logical(as.numeric(col)),
categorical = as.factor(col),
date = lubridate::as_date(col)
)
}


# Mini-function to deal with arrays
# Some Athena columns are stored as arrays but are converted to string on
# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
# collapse the array into a comma-separated string (if more than 1 unit),
# or replace with NA if the array is empty
process_array_columns <- function(data, selector) {
data %>%
mutate(
across(
!!enquo(selector),
~ sapply(.x, function(cell) {
if (length(cell) > 1) {
paste(cell, collapse = ", ")
} else if (length(cell) == 1) {
as.character(cell) # Convert the single element to character
} else {
NA # Handle cases where the array is empty
}
})
)
)
}




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Home Improvement Exemptions -----------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -312,9 +358,8 @@ training_data_clean <- training_data_w_hie %>%
),
char_ncu = ifelse(char_class == "212" & !is.na(char_ncu), char_ncu, 0)
) %>%
# Apply the helper function to process array columns
process_array_columns(starts_with("loc_tax_")) %>%
mutate(
across(starts_with("loc_tax_"), process_array_column),
loc_tax_municipality_name =
replace_na(loc_tax_municipality_name, "UNINCORPORATED")
) %>%
@@ -428,8 +473,8 @@ assessment_data_clean <- assessment_data_w_hie %>%
as_factor = FALSE
) %>%
# Apply the helper function to process array columns
process_array_columns(starts_with("loc_tax_")) %>%
mutate(
across(starts_with("loc_tax_"), process_array_column),
loc_tax_municipality_name =
replace_na(loc_tax_municipality_name, "UNINCORPORATED")
) %>%