From 10c8776aefd1663bf39020586d1269519228b40f Mon Sep 17 00:00:00 2001 From: Damon Bayer Date: Thu, 5 Dec 2024 18:14:35 -0600 Subject: [PATCH] Put data in a subdirectory (#192) --- .gitignore | 4 +- hewr/R/process_state_forecast.R | 6 +- pipelines/build_model.py | 2 +- pipelines/forecast_state.py | 2 +- pipelines/generate_epiweekly.R | 8 +- pipelines/prep_data.py | 12 +- pipelines/score_forecast.R | 1 + .../model_runs/TD/data.csv | 181 ------------------ .../model_runs/TD/data/data.tsv | 181 ++++++++++++++++++ .../TD/{ => data}/data_for_model_fit.json | 0 .../model_runs/TD/{ => data}/eval_data.tsv | 0 pipelines/timeseries_forecasts.R | 4 +- 12 files changed, 200 insertions(+), 201 deletions(-) delete mode 100644 pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data.csv create mode 100644 pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data.tsv rename pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/{ => data}/data_for_model_fit.json (100%) rename pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/{ => data}/eval_data.tsv (100%) diff --git a/.gitignore b/.gitignore index 9f92efd7..a3ae215c 100644 --- a/.gitignore +++ b/.gitignore @@ -399,5 +399,5 @@ private_data/* .vscode/settings.json # Test data exceptions to the general data exclusion -!pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data.csv -!pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/eval_data.tsv +!pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data.tsv +!pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/eval_data.tsv diff --git a/hewr/R/process_state_forecast.R b/hewr/R/process_state_forecast.R index 55302fd2..9a42357a 100644 --- a/hewr/R/process_state_forecast.R +++ b/hewr/R/process_state_forecast.R @@ -9,10 +9,10 @@ process_state_forecast <- function(model_run_dir, save = TRUE) { disease_name_nssp <- parse_model_run_dir_path(model_run_dir)$disease - train_data_path <- fs::path(model_run_dir, "data", ext = "csv") - train_dat <- readr::read_csv(train_data_path, show_col_types = FALSE) + train_data_path <- fs::path(model_run_dir, "data", "data", ext = "tsv") + train_dat <- readr::read_tsv(train_data_path, show_col_types = FALSE) - eval_data_path <- fs::path(model_run_dir, "eval_data", ext = "tsv") + eval_data_path <- fs::path(model_run_dir, "data", "eval_data", ext = "tsv") eval_dat <- readr::read_tsv(eval_data_path, show_col_types = FALSE) |> dplyr::mutate(data_type = "eval") diff --git a/pipelines/build_model.py b/pipelines/build_model.py index af1d4dbb..5d062f38 100644 --- a/pipelines/build_model.py +++ b/pipelines/build_model.py @@ -8,7 +8,7 @@ def build_model_from_dir(model_dir): - data_path = model_dir / "data_for_model_fit.json" + data_path = model_dir / "data" / "data_for_model_fit.json" prior_path = model_dir / "priors.py" with open( diff --git a/pipelines/forecast_state.py b/pipelines/forecast_state.py index 2a150b52..1302ceb5 100644 --- a/pipelines/forecast_state.py +++ b/pipelines/forecast_state.py @@ -253,7 +253,7 @@ def main( first_training_date=first_training_date, last_training_date=last_training_date, latest_comprehensive_path=eval_data_path, - output_data_dir=model_run_dir, + output_data_dir=Path(model_run_dir, "data"), last_eval_date=report_date + timedelta(days=n_forecast_days), ) diff --git a/pipelines/generate_epiweekly.R b/pipelines/generate_epiweekly.R index b69fc043..932c1aee 100644 --- a/pipelines/generate_epiweekly.R +++ b/pipelines/generate_epiweekly.R @@ -31,7 +31,7 @@ purrr::walk(script_packages, \(pkg) { #' @return None. The function writes the epiweekly data to a CSV file in the #' specified directory. convert_daily_to_epiweekly <- function( - model_run_dir, dataname = "data.csv", + model_run_dir, dataname = "data.tsv", strict = TRUE, day_of_week = 7) { ext <- path_ext(dataname) data_basename <- path_ext_remove(dataname) @@ -42,7 +42,7 @@ convert_daily_to_epiweekly <- function( delim <- if (ext == "csv") "," else "\t" message(glue::glue("Generating epi-weekly data {model_run_dir}...")) - data_path <- path(model_run_dir, dataname) + data_path <- path(model_run_dir, "data", dataname) daily_data <- read_delim( data_path, @@ -73,7 +73,7 @@ convert_daily_to_epiweekly <- function( # epiweek end date determines data_type classification output_file <- path( - model_run_dir, + model_run_dir, "data", glue::glue("epiweekly_{data_basename}"), ext = ext ) @@ -82,7 +82,7 @@ convert_daily_to_epiweekly <- function( } main <- function(model_run_dir) { - convert_daily_to_epiweekly(model_run_dir, dataname = "data.csv") + convert_daily_to_epiweekly(model_run_dir, dataname = "data.tsv") convert_daily_to_epiweekly(model_run_dir, dataname = "eval_data.tsv") } diff --git a/pipelines/prep_data.py b/pipelines/prep_data.py index f1a74d94..ff892560 100644 --- a/pipelines/prep_data.py +++ b/pipelines/prep_data.py @@ -332,16 +332,14 @@ def process_and_save_state( "state_pop": state_pop, "right_truncation_offset": right_truncation_offset, } - - os.makedirs(model_run_dir, exist_ok=True) + data_dir = Path(model_run_dir, "data") + os.makedirs(data_dir, exist_ok=True) if logger is not None: - logger.info(f"Saving {state_abb} to {model_run_dir}") - data_to_save.write_csv(Path(model_run_dir, "data.csv")) + logger.info(f"Saving {state_abb} to {data_dir}") + data_to_save.write_csv(Path(data_dir, "data.tsv"), separator="\t") - with open( - Path(model_run_dir, "data_for_model_fit.json"), "w" - ) as json_file: + with open(Path(data_dir, "data_for_model_fit.json"), "w") as json_file: json.dump(data_for_model_fit, json_file) return None diff --git a/pipelines/score_forecast.R b/pipelines/score_forecast.R index 5384541b..3c9af194 100644 --- a/pipelines/score_forecast.R +++ b/pipelines/score_forecast.R @@ -200,6 +200,7 @@ read_and_score_location <- function(model_run_dir, ) truth_path <- fs::path(model_run_dir, + "data", eval_data_filename, ext = eval_data_file_ext ) diff --git a/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data.csv b/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data.csv deleted file mode 100644 index b92e5e21..00000000 --- a/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data.csv +++ /dev/null @@ -1,181 +0,0 @@ -date,disease,ed_visits,data_type -2023-11-01,COVID-19,6,train -2023-11-02,COVID-19,6,train -2023-11-03,COVID-19,7,train -2023-11-04,COVID-19,10,train -2023-11-05,COVID-19,10,train -2023-11-06,COVID-19,12,train -2023-11-07,COVID-19,12,train -2023-11-08,COVID-19,10,train -2023-11-09,COVID-19,8,train -2023-11-10,COVID-19,15,train -2023-11-11,COVID-19,8,train -2023-11-12,COVID-19,9,train -2023-11-13,COVID-19,9,train -2023-11-14,COVID-19,13,train -2023-11-15,COVID-19,17,train -2023-11-16,COVID-19,7,train -2023-11-17,COVID-19,12,train -2023-11-18,COVID-19,10,train -2023-11-19,COVID-19,13,train -2023-11-20,COVID-19,10,train -2023-11-21,COVID-19,12,train -2023-11-22,COVID-19,15,train -2023-11-23,COVID-19,19,train -2023-11-24,COVID-19,19,train -2023-11-25,COVID-19,22,train -2023-11-26,COVID-19,17,train -2023-11-27,COVID-19,19,train -2023-11-28,COVID-19,14,train -2023-11-29,COVID-19,17,train -2023-11-30,COVID-19,19,train -2023-12-01,COVID-19,18,train -2023-12-02,COVID-19,13,train -2023-12-03,COVID-19,24,train -2023-12-04,COVID-19,21,train -2023-12-05,COVID-19,35,train -2023-12-06,COVID-19,26,train -2023-12-07,COVID-19,25,train -2023-12-08,COVID-19,30,train -2023-12-09,COVID-19,26,train -2023-12-10,COVID-19,20,train -2023-12-11,COVID-19,29,train -2023-12-12,COVID-19,38,train -2023-12-13,COVID-19,35,train -2023-12-14,COVID-19,41,train -2023-12-15,COVID-19,30,train -2023-12-16,COVID-19,37,train -2023-12-17,COVID-19,35,train -2023-12-18,COVID-19,46,train -2023-12-19,COVID-19,38,train -2023-12-20,COVID-19,23,train -2023-12-21,COVID-19,38,train -2023-12-22,COVID-19,22,train -2023-12-23,COVID-19,28,train -2023-12-24,COVID-19,23,train -2023-12-25,COVID-19,31,train -2023-12-26,COVID-19,19,train -2023-12-27,COVID-19,23,train -2023-12-28,COVID-19,17,train -2023-12-29,COVID-19,23,train -2023-12-30,COVID-19,26,train -2023-12-31,COVID-19,17,train -2024-01-01,COVID-19,17,train -2024-01-02,COVID-19,12,train -2024-01-03,COVID-19,13,train -2024-01-04,COVID-19,9,train -2024-01-05,COVID-19,22,train -2024-01-06,COVID-19,12,train -2024-01-07,COVID-19,13,train -2024-01-08,COVID-19,17,train -2024-01-09,COVID-19,14,train -2024-01-10,COVID-19,12,train -2024-01-11,COVID-19,6,train -2024-01-12,COVID-19,10,train -2024-01-13,COVID-19,10,train -2024-01-14,COVID-19,4,train -2024-01-15,COVID-19,12,train -2024-01-16,COVID-19,9,train -2024-01-17,COVID-19,8,train -2024-01-18,COVID-19,9,train -2024-01-19,COVID-19,8,train -2024-01-20,COVID-19,6,train -2024-01-21,COVID-19,13,train -2024-01-22,COVID-19,7,train -2024-01-23,COVID-19,8,train -2024-01-24,COVID-19,13,train -2024-01-25,COVID-19,9,train -2024-01-26,COVID-19,9,train -2024-01-27,COVID-19,17,train -2024-01-28,COVID-19,7,train -2024-01-29,COVID-19,10,train -2023-11-01,Total,105,train -2023-11-02,Total,105,train -2023-11-03,Total,104,train -2023-11-04,Total,109,train -2023-11-05,Total,105,train -2023-11-06,Total,126,train -2023-11-07,Total,118,train -2023-11-08,Total,99,train -2023-11-09,Total,119,train -2023-11-10,Total,115,train -2023-11-11,Total,106,train -2023-11-12,Total,123,train -2023-11-13,Total,104,train -2023-11-14,Total,124,train -2023-11-15,Total,102,train -2023-11-16,Total,102,train -2023-11-17,Total,130,train -2023-11-18,Total,126,train -2023-11-19,Total,112,train -2023-11-20,Total,97,train -2023-11-21,Total,109,train -2023-11-22,Total,107,train -2023-11-23,Total,102,train -2023-11-24,Total,120,train -2023-11-25,Total,125,train -2023-11-26,Total,109,train -2023-11-27,Total,110,train -2023-11-28,Total,100,train -2023-11-29,Total,118,train -2023-11-30,Total,128,train -2023-12-01,Total,123,train -2023-12-02,Total,113,train -2023-12-03,Total,122,train -2023-12-04,Total,121,train -2023-12-05,Total,151,train -2023-12-06,Total,136,train -2023-12-07,Total,140,train -2023-12-08,Total,120,train -2023-12-09,Total,143,train -2023-12-10,Total,123,train -2023-12-11,Total,121,train -2023-12-12,Total,152,train -2023-12-13,Total,125,train -2023-12-14,Total,138,train -2023-12-15,Total,153,train -2023-12-16,Total,134,train -2023-12-17,Total,134,train -2023-12-18,Total,137,train -2023-12-19,Total,149,train -2023-12-20,Total,111,train -2023-12-21,Total,135,train -2023-12-22,Total,121,train -2023-12-23,Total,129,train -2023-12-24,Total,117,train -2023-12-25,Total,147,train -2023-12-26,Total,118,train -2023-12-27,Total,128,train -2023-12-28,Total,118,train -2023-12-29,Total,140,train -2023-12-30,Total,119,train -2023-12-31,Total,110,train -2024-01-01,Total,105,train -2024-01-02,Total,108,train -2024-01-03,Total,112,train -2024-01-04,Total,94,train -2024-01-05,Total,103,train -2024-01-06,Total,111,train -2024-01-07,Total,120,train -2024-01-08,Total,126,train -2024-01-09,Total,120,train -2024-01-10,Total,124,train -2024-01-11,Total,101,train -2024-01-12,Total,128,train -2024-01-13,Total,114,train -2024-01-14,Total,102,train -2024-01-15,Total,97,train -2024-01-16,Total,89,train -2024-01-17,Total,112,train -2024-01-18,Total,116,train -2024-01-19,Total,109,train -2024-01-20,Total,97,train -2024-01-21,Total,115,train -2024-01-22,Total,118,train -2024-01-23,Total,117,train -2024-01-24,Total,106,train -2024-01-25,Total,102,train -2024-01-26,Total,100,train -2024-01-27,Total,126,train -2024-01-28,Total,116,train -2024-01-29,Total,116,train diff --git a/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data.tsv b/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data.tsv new file mode 100644 index 00000000..29f11011 --- /dev/null +++ b/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data.tsv @@ -0,0 +1,181 @@ +date disease ed_visits data_type +2023-11-01 COVID-19 6 train +2023-11-02 COVID-19 6 train +2023-11-03 COVID-19 7 train +2023-11-04 COVID-19 10 train +2023-11-05 COVID-19 10 train +2023-11-06 COVID-19 12 train +2023-11-07 COVID-19 12 train +2023-11-08 COVID-19 10 train +2023-11-09 COVID-19 8 train +2023-11-10 COVID-19 15 train +2023-11-11 COVID-19 8 train +2023-11-12 COVID-19 9 train +2023-11-13 COVID-19 9 train +2023-11-14 COVID-19 13 train +2023-11-15 COVID-19 17 train +2023-11-16 COVID-19 7 train +2023-11-17 COVID-19 12 train +2023-11-18 COVID-19 10 train +2023-11-19 COVID-19 13 train +2023-11-20 COVID-19 10 train +2023-11-21 COVID-19 12 train +2023-11-22 COVID-19 15 train +2023-11-23 COVID-19 19 train +2023-11-24 COVID-19 19 train +2023-11-25 COVID-19 22 train +2023-11-26 COVID-19 17 train +2023-11-27 COVID-19 19 train +2023-11-28 COVID-19 14 train +2023-11-29 COVID-19 17 train +2023-11-30 COVID-19 19 train +2023-12-01 COVID-19 18 train +2023-12-02 COVID-19 13 train +2023-12-03 COVID-19 24 train +2023-12-04 COVID-19 21 train +2023-12-05 COVID-19 35 train +2023-12-06 COVID-19 26 train +2023-12-07 COVID-19 25 train +2023-12-08 COVID-19 30 train +2023-12-09 COVID-19 26 train +2023-12-10 COVID-19 20 train +2023-12-11 COVID-19 29 train +2023-12-12 COVID-19 38 train +2023-12-13 COVID-19 35 train +2023-12-14 COVID-19 41 train +2023-12-15 COVID-19 30 train +2023-12-16 COVID-19 37 train +2023-12-17 COVID-19 35 train +2023-12-18 COVID-19 46 train +2023-12-19 COVID-19 38 train +2023-12-20 COVID-19 23 train +2023-12-21 COVID-19 38 train +2023-12-22 COVID-19 22 train +2023-12-23 COVID-19 28 train +2023-12-24 COVID-19 23 train +2023-12-25 COVID-19 31 train +2023-12-26 COVID-19 19 train +2023-12-27 COVID-19 23 train +2023-12-28 COVID-19 17 train +2023-12-29 COVID-19 23 train +2023-12-30 COVID-19 26 train +2023-12-31 COVID-19 17 train +2024-01-01 COVID-19 17 train +2024-01-02 COVID-19 12 train +2024-01-03 COVID-19 13 train +2024-01-04 COVID-19 9 train +2024-01-05 COVID-19 22 train +2024-01-06 COVID-19 12 train +2024-01-07 COVID-19 13 train +2024-01-08 COVID-19 17 train +2024-01-09 COVID-19 14 train +2024-01-10 COVID-19 12 train +2024-01-11 COVID-19 6 train +2024-01-12 COVID-19 10 train +2024-01-13 COVID-19 10 train +2024-01-14 COVID-19 4 train +2024-01-15 COVID-19 12 train +2024-01-16 COVID-19 9 train +2024-01-17 COVID-19 8 train +2024-01-18 COVID-19 9 train +2024-01-19 COVID-19 8 train +2024-01-20 COVID-19 6 train +2024-01-21 COVID-19 13 train +2024-01-22 COVID-19 7 train +2024-01-23 COVID-19 8 train +2024-01-24 COVID-19 13 train +2024-01-25 COVID-19 9 train +2024-01-26 COVID-19 9 train +2024-01-27 COVID-19 17 train +2024-01-28 COVID-19 7 train +2024-01-29 COVID-19 10 train +2023-11-01 Total 105 train +2023-11-02 Total 105 train +2023-11-03 Total 104 train +2023-11-04 Total 109 train +2023-11-05 Total 105 train +2023-11-06 Total 126 train +2023-11-07 Total 118 train +2023-11-08 Total 99 train +2023-11-09 Total 119 train +2023-11-10 Total 115 train +2023-11-11 Total 106 train +2023-11-12 Total 123 train +2023-11-13 Total 104 train +2023-11-14 Total 124 train +2023-11-15 Total 102 train +2023-11-16 Total 102 train +2023-11-17 Total 130 train +2023-11-18 Total 126 train +2023-11-19 Total 112 train +2023-11-20 Total 97 train +2023-11-21 Total 109 train +2023-11-22 Total 107 train +2023-11-23 Total 102 train +2023-11-24 Total 120 train +2023-11-25 Total 125 train +2023-11-26 Total 109 train +2023-11-27 Total 110 train +2023-11-28 Total 100 train +2023-11-29 Total 118 train +2023-11-30 Total 128 train +2023-12-01 Total 123 train +2023-12-02 Total 113 train +2023-12-03 Total 122 train +2023-12-04 Total 121 train +2023-12-05 Total 151 train +2023-12-06 Total 136 train +2023-12-07 Total 140 train +2023-12-08 Total 120 train +2023-12-09 Total 143 train +2023-12-10 Total 123 train +2023-12-11 Total 121 train +2023-12-12 Total 152 train +2023-12-13 Total 125 train +2023-12-14 Total 138 train +2023-12-15 Total 153 train +2023-12-16 Total 134 train +2023-12-17 Total 134 train +2023-12-18 Total 137 train +2023-12-19 Total 149 train +2023-12-20 Total 111 train +2023-12-21 Total 135 train +2023-12-22 Total 121 train +2023-12-23 Total 129 train +2023-12-24 Total 117 train +2023-12-25 Total 147 train +2023-12-26 Total 118 train +2023-12-27 Total 128 train +2023-12-28 Total 118 train +2023-12-29 Total 140 train +2023-12-30 Total 119 train +2023-12-31 Total 110 train +2024-01-01 Total 105 train +2024-01-02 Total 108 train +2024-01-03 Total 112 train +2024-01-04 Total 94 train +2024-01-05 Total 103 train +2024-01-06 Total 111 train +2024-01-07 Total 120 train +2024-01-08 Total 126 train +2024-01-09 Total 120 train +2024-01-10 Total 124 train +2024-01-11 Total 101 train +2024-01-12 Total 128 train +2024-01-13 Total 114 train +2024-01-14 Total 102 train +2024-01-15 Total 97 train +2024-01-16 Total 89 train +2024-01-17 Total 112 train +2024-01-18 Total 116 train +2024-01-19 Total 109 train +2024-01-20 Total 97 train +2024-01-21 Total 115 train +2024-01-22 Total 118 train +2024-01-23 Total 117 train +2024-01-24 Total 106 train +2024-01-25 Total 102 train +2024-01-26 Total 100 train +2024-01-27 Total 126 train +2024-01-28 Total 116 train +2024-01-29 Total 116 train diff --git a/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data_for_model_fit.json b/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data_for_model_fit.json similarity index 100% rename from pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data_for_model_fit.json rename to pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/data_for_model_fit.json diff --git a/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/eval_data.tsv b/pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/eval_data.tsv similarity index 100% rename from pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/eval_data.tsv rename to pipelines/tests/covid-19_r_2024-01-29_f_2023-11-01_t_2024-01-29/model_runs/TD/data/eval_data.tsv diff --git a/pipelines/timeseries_forecasts.R b/pipelines/timeseries_forecasts.R index 43436d37..8c0a2887 100644 --- a/pipelines/timeseries_forecasts.R +++ b/pipelines/timeseries_forecasts.R @@ -153,9 +153,9 @@ main <- function( data_frequency <- if_else(epiweekly, "1 week", "1 day") dataname <- if_else(epiweekly, "epiweekly_data", "data") # to do: do this with json data that has dates - data_path <- path(model_run_dir, dataname, ext = "csv") + data_path <- path(model_run_dir, "data", dataname, ext = "tsv") - target_and_other_data <- read_csv( + target_and_other_data <- read_tsv( data_path, col_types = cols( disease = col_character(),