Commit
Update data / parameters for 2025 model and implement forward filling (#330)

* Update params

* Remove from setup

* Update dvc

* Include forward filling

* mutate to character

* Move process arrays up

* Tidy up

* Remove duplicated code

* Push dvc changes

* Update forward filling

* Improve forward filling to work with training_data

* Remove duplicate loc_ processing

* lintr

* lintr

* styler

* Push dvc

* Revert dvc.yaml

* Update input data with forward fill

* Bump ccao and assessr packages

* Add data.table forward filling

* Update input data

* Style ingest script

---------

Co-authored-by: Dan Snow <daniel.snow@cookcountyil.gov>
Damonamajor and dfsnow authored Jan 23, 2025
1 parent c46937c commit f823872
Showing 5 changed files with 144 additions and 99 deletions.
14 changes: 7 additions & 7 deletions analyses/new-feature-template.qmd
@@ -15,13 +15,13 @@ format:
fig-align: center
fontsize: 12pt
params:
run_id: "2024-07-03-charming-boni"
run_id_year: "2024"
comparison_run_id: "2024-07-13-great-eric"
comparison_run_id_year: "2024"
added_feature: "prox_nearest_new_construction_dist_ft"
added_feature_shap: "prox_nearest_new_construction_dist_ft_shap"
description: "A distance in feet to the nearest new construction"
run_id: "2025-01-13-dazzling-kyra"
run_id_year: "2025"
comparison_run_id: "2025-01-10-serene-boni"
comparison_run_id_year: "2025"
added_feature: "time_sale_roll_mean_nbhd_t0_w3"
added_feature_shap: "time_sale_roll_mean_nbhd_t0_w3_shap"
description: "Added feature to calculate neighborhood rolling averages of sale price"
min_range: 5
max_range: 95
type: "continuous"
38 changes: 19 additions & 19 deletions dvc.lock
@@ -5,20 +5,20 @@ stages:
deps:
- path: pipeline/00-ingest.R
hash: md5
md5: 3fcf8cbce89340948f394825b1c78187
size: 23441
md5: 6aa03fd110dcc51b509585c22c7d7ff7
size: 24625
params:
params.yaml:
assessment:
year: '2024'
date: '2024-01-01'
year: '2025'
date: '2025-01-01'
triad: north
group: residential
data_year: '2023'
data_year: '2024'
working_year: '2025'
input:
min_sale_year: '2015'
max_sale_year: '2023'
min_sale_year: '2016'
max_sale_year: '2024'
n_years_prior: 4
complex:
match_exact:
@@ -38,28 +38,28 @@
outs:
- path: input/assessment_data.parquet
hash: md5
md5: cecaf4aee89d2269bd059f536e611101
size: 425453415
md5: ba6f103742f30bba7da6759728fd8bb1
size: 415029836
- path: input/char_data.parquet
hash: md5
md5: 121f3017e46d8acd1f19ba689dca9726
size: 848066404
md5: f783f4629433bf1b5e8f1ef3033cba0a
size: 840256376
- path: input/complex_id_data.parquet
hash: md5
md5: 835be789fc9ef09f3bfa1d5c8465f6e6
size: 704175
md5: 892c2a64cd5f8f35f9a4c9608fa1464e
size: 701036
- path: input/hie_data.parquet
hash: md5
md5: ec600ed3e19b5b48059ce1d270b114b5
size: 1911086
md5: fcaa54bf22240ed6fcacd2b7b6a34941
size: 1924013
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: f3ec9627322bd271bf2957b7388aaa34
size: 3873
md5: 5fe80edfabdfac91efe888a25ee4051c
size: 6019
- path: input/training_data.parquet
hash: md5
md5: 3156fd30394ae3fb9eda7e0d0176ab2f
size: 208501951
md5: 18addc910a0ccbf7bb5b2fd9c7923f2d
size: 204978802
train:
cmd: Rscript pipeline/01-train.R
deps:
18 changes: 9 additions & 9 deletions params.yaml
@@ -15,7 +15,7 @@ run_type: "test"

# Note included with each run. Use this to summarize what changed about the run
# or add context
run_note: Preparing for 2025 model with 2024 data
run_note: Preparing for 2025 model with 2024 data and updated sales

toggle:
# Should the train stage run full cross-validation? Otherwise, the model
@@ -39,10 +39,10 @@ toggle:
# Assessment context and dates
assessment:
# Year of assessment. Used to pull land rates, HIEs, and other information
year: "2024"
year: "2025"

# The statutorily set "sale date" for the purpose of prediction
date: "2024-01-01"
date: "2025-01-01"

# Added context for model artifacts stored in S3. Also updates the triad
# displayed in email notifications on model completion
@@ -51,7 +51,7 @@ assessment:

# Year from which property characteristics are pulled. Usually lags the
# assessment year by 1
data_year: "2023"
data_year: "2024"

# Year used to partition data on S3. Working year in this case means
# the year the Data Department is currently creating models for
@@ -60,8 +60,8 @@ assessment:
# Parameters used to define the input/training data
input:
# The min and max year of sales to use for the training data sample
min_sale_year: "2015"
max_sale_year: "2023"
min_sale_year: "2016"
max_sale_year: "2024"

# Number of years back to look for count_past_n_years feature
n_years_prior: 4
@@ -138,7 +138,7 @@ model:

# Parameters related to model determinism. Current settings should force
# the same output every time if the same hyperparameters are used
seed: 2024
seed: 2025
deterministic: true
force_row_wise: true

@@ -386,10 +386,10 @@ pv:
# (post-appeal) from the last reassessment and the most recent values from the
# prior year
ratio_study:
far_year: "2021"
far_year: "2022"
far_stage: "board"
far_column: "meta_2yr_pri_board_tot"
near_year: "2023"
near_year: "2024"
near_stage: "certified"
near_column: "meta_certified_tot"

159 changes: 102 additions & 57 deletions pipeline/00-ingest.R
@@ -32,7 +32,52 @@ AWS_ATHENA_CONN_NOCTUA <- dbConnect(


#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Pull Data -----------------------------------------------------------------
# 2. Define Functions ----------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Ingest-specific helper functions for data cleaning, etc.

# Create a dictionary of column types, as specified in ccao::vars_dict
col_type_dict <- ccao::vars_dict %>%
distinct(var_name = var_name_model, var_type = var_data_type) %>%
drop_na(var_name)

# Mini-function to ensure that columns are the correct type
recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
col_type <- dictionary %>%
filter(var_name == col_name) %>%
pull(var_type)
switch(col_type,
numeric = as.numeric(col),
character = as.character(col),
logical = as.logical(as.numeric(col)),
categorical = as.factor(col),
date = lubridate::as_date(col)
)
}
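
# Illustrative usage (a sketch, not run by the pipeline): the helper above
# can be applied across an entire tibble with dplyr, where `df` stands in
# for a hypothetical data frame whose columns all appear in col_type_dict:
#
#   df_typed <- df %>%
#     mutate(across(everything(), ~ recode_column_type(.x, cur_column())))
#
# cur_column() supplies each column's name, so the switch() above picks the
# matching coercion from the dictionary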

# Mini-function to deal with arrays
# Some Athena columns are stored as arrays but are converted to string on
# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
# collapse the array into a comma-separated string (if more than 1 unit),
# or replace with NA if the array is empty
process_array_column <- function(x) {
purrr::map_chr(x, function(cell) {
if (length(cell) > 1) {
paste(cell, collapse = ", ")
} else if (length(cell) == 1) {
as.character(cell) # Convert the single element to character
} else {
NA # Handle cases where the array is empty
}
})
}
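
# Illustrative behavior (a sketch with toy input, not run by the pipeline):
#
#   process_array_column(list(c("a", "b"), "c", character(0)))
#   #> [1] "a, b" "c"    NA
#
# A two-element array collapses to a comma-separated string, a single
# element is returned as character, and an empty array becomes NA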




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 3. Pull Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Pulling data from Athena")

@@ -68,7 +113,6 @@ training_data <- dbGetQuery(
AND NOT sale.sale_filter_same_sale_within_365
AND NOT sale.sale_filter_less_than_10k
AND NOT sale.sale_filter_deed_type
AND Year(sale.sale_date) >= {params$input$min_sale_year}
")
)
tictoc::toc()
@@ -102,6 +146,60 @@ assessment_data <- dbGetQuery(
)
tictoc::toc()




##### START TEMPORARY FIX FOR MISSING DATA. REMOVE ONCE 2024 DATA IS AVAILABLE
library(data.table)
conflict_prefer_all("dplyr", "data.table", quiet = TRUE)
conflict_prefer_all("lubridate", "data.table", quiet = TRUE)
fill_cols <- assessment_data %>%
select(
starts_with("loc_"),
starts_with("prox_"),
starts_with("acs5_"),
starts_with("other_"),
starts_with("shp_")
) %>%
names()
assessment_data_temp <- as.data.table(assessment_data) %>%
mutate(across(starts_with("loc_tax_"), process_array_column))
assessment_data_temp_2024 <- assessment_data_temp[
meta_year == "2024",
][
assessment_data_temp[meta_year == "2023"],
(fill_cols) := mget(paste0("i.", fill_cols)),
on = .(meta_pin, meta_card_num)
]
assessment_data <- rbind(
assessment_data_temp[meta_year != "2024"],
assessment_data_temp_2024
) %>%
as_tibble()

training_data_temp <- as.data.table(training_data) %>%
mutate(across(starts_with("loc_tax_"), process_array_column))
training_data_temp_2024 <- training_data_temp[
meta_year == "2024",
][
assessment_data_temp[meta_year == "2023"],
(fill_cols) := mget(paste0("i.", fill_cols)),
on = .(meta_pin, meta_card_num)
]
training_data <- rbind(
training_data_temp[meta_year != "2024"],
training_data_temp_2024
) %>%
as_tibble()
rm(
assessment_data_temp, assessment_data_temp_2024,
training_data_temp, training_data_temp_2024
)
##### END TEMPORARY FIX
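
# The fix above relies on data.table's update join:
# X[Y, (cols) := mget(paste0("i.", cols)), on = .(keys)] overwrites X's
# columns in place with values from the joined table Y (the "i." prefix
# refers to Y's columns). A minimal sketch of the pattern, using
# hypothetical toy tables:
#
#   library(data.table)
#   x <- data.table(pin = c("A", "B"), val = c(NA_real_, NA_real_))
#   y <- data.table(pin = c("A", "B"), val = c(10, 20))
#   x[y, val := i.val, on = .(pin)]
#   x$val # now c(10, 20)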




# Save both years for report generation using the characteristics
assessment_data %>%
write_parquet(paths$input$char$local)
@@ -128,58 +226,6 @@ rm(AWS_ATHENA_CONN_NOCTUA)



#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 3. Define Functions ----------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Ingest-specific helper functions for data cleaning, etc.

# Create a dictionary of column types, as specified in ccao::vars_dict
col_type_dict <- ccao::vars_dict %>%
distinct(var_name = var_name_model, var_type = var_data_type) %>%
drop_na(var_name)

# Mini-function to ensure that columns are the correct type
recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
col_type <- dictionary %>%
filter(var_name == col_name) %>%
pull(var_type)
switch(col_type,
numeric = as.numeric(col),
character = as.character(col),
logical = as.logical(as.numeric(col)),
categorical = as.factor(col),
date = lubridate::as_date(col)
)
}


# Mini-function to deal with arrays
# Some Athena columns are stored as arrays but are converted to string on
# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
# collapse the array into a comma-separated string (if more than 1 unit),
# or replace with NA if the array is empty
process_array_columns <- function(data, selector) {
data %>%
mutate(
across(
!!enquo(selector),
~ sapply(.x, function(cell) {
if (length(cell) > 1) {
paste(cell, collapse = ", ")
} else if (length(cell) == 1) {
as.character(cell) # Convert the single element to character
} else {
NA # Handle cases where the array is empty
}
})
)
)
}




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Home Improvement Exemptions -----------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -312,9 +358,8 @@ training_data_clean <- training_data_w_hie %>%
),
char_ncu = ifelse(char_class == "212" & !is.na(char_ncu), char_ncu, 0)
) %>%
# Apply the helper function to process array columns
process_array_columns(starts_with("loc_tax_")) %>%
mutate(
across(starts_with("loc_tax_"), process_array_column),
loc_tax_municipality_name =
replace_na(loc_tax_municipality_name, "UNINCORPORATED")
) %>%
@@ -428,8 +473,8 @@ assessment_data_clean <- assessment_data_w_hie %>%
as_factor = FALSE
) %>%
# Apply the helper function to process array columns
process_array_columns(starts_with("loc_tax_")) %>%
mutate(
across(starts_with("loc_tax_"), process_array_column),
loc_tax_municipality_name =
replace_na(loc_tax_municipality_name, "UNINCORPORATED")
) %>%