From 780dd3505af6c3fa509f8ce2783fba6a7bc1e20b Mon Sep 17 00:00:00 2001 From: PorwalS Date: Wed, 22 Jan 2025 16:47:12 -0500 Subject: [PATCH] Update compare_clean_data, add tests --- R/compare_clean_data.R | 196 ++++++++++------------- tests/testthat/test-compare_clean_data.R | 60 ++++++- 2 files changed, 147 insertions(+), 109 deletions(-) diff --git a/R/compare_clean_data.R b/R/compare_clean_data.R index 6a6b2bc..b987cac 100644 --- a/R/compare_clean_data.R +++ b/R/compare_clean_data.R @@ -4,8 +4,8 @@ #' and generates outputs for numeric, factor, character, binary, date, and other data types. #' The results are saved to an output directory, and optional views of the data can be displayed or saved. #' -#' @param df_old_path Character. Path to the old dataset file (e.g., a CSV file). -#' @param df_new_path Character. Path to the new dataset file (e.g., a CSV file). +#' @param old_data Dataframe. The old dataset for comparison. +#' @param new_data Dataframe. The new dataset for comparison. #' @param output_dir Character. Path to the directory where the output files will be saved. #' @param final_vars_set Character vector. List of variable names to include in the comparison. #' @param date_col Character. Name of the column in the datasets representing the date. @@ -17,128 +17,110 @@ #' @return A list of dataframes containing the comparison results. Dataframes include numeric, factor, character, binary, date, #' and other derived datasets. Entries with missing data are omitted. #' -#' The function also saves CSV files of the comparison results to the specified output directory. -#' If `save_views` is TRUE, the views of the dataframes are saved as additional files in the output directory. -#' #' @export compare_clean_data <- function( - df_old_path, - df_new_path, - output_dir, - final_vars_set, - date_col, - limit_to_same_date = TRUE, - show_views = FALSE, - save_views = FALSE + old_data, + new_data, + output_dir, + final_vars_set, + date_col, + limit_to_same_date = TRUE, + show_views = FALSE, + save_views = FALSE ) { - # Read and select specific columns - old_data <- suppressWarnings( - read_raw_data(file = df_old_path) |> select(all_of(final_vars_set)) +# Select specific columns +old_data <- old_data |> select(all_of(final_vars_set)) +new_data <- new_data |> select(all_of(final_vars_set)) + +# Ensure the date column exists in the datasets +if (!date_col %in% names(old_data) | !date_col %in% names(new_data)) { + stop(paste("Date column", date_col, "not found in one or both datasets.")) +} + +# Code block to limit compare to similar years +max_filter_date <- max(old_data[[date_col]], na.rm = TRUE) +min_filter_date <- min(old_data[[date_col]], na.rm = TRUE) + +if (limit_to_same_date) { + new_data <- suppressWarnings( + new_data |> + filter(.data[[date_col]] <= max_filter_date & .data[[date_col]] >= min_filter_date) ) +} else { new_data <- suppressWarnings( - read_raw_data(file = df_new_path) |> select(all_of(final_vars_set)) + new_data |> + filter(.data[[date_col]] >= min_filter_date) ) +} - # Ensure the date column exists in the datasets - if (!date_col %in% names(old_data) | !date_col %in% names(new_data)) { - stop(paste("Date column", date_col, "not found in one or both datasets.")) - } +# Run Compare ------------------------------------------------------------- +final_list <- suppressWarnings( + tryCatch(compare_df(old_data, new_data), error = function(e) NULL) +) - # Code block to limit compare to similar years - max_filter_date <- max(old_data[[date_col]], na.rm = TRUE) - min_filter_date <- min(old_data[[date_col]], na.rm = TRUE) +# Gracefully handle if final_list is NULL +if (is.null(final_list)) { + warning("Comparison failed. Returning empty object.") + return(list()) +} - if (limit_to_same_date) { - new_data <- suppressWarnings( - new_data |> - filter(.data[[date_col]] <= max_filter_date & .data[[date_col]] >= min_filter_date) - ) - } else { - new_data <- suppressWarnings( - new_data |> - filter(.data[[date_col]] >= min_filter_date) - ) - } +# Use current date for file naming +current_date <- format(Sys.Date(), "%Y-%m-%d") - # Run Compare ------------------------------------------------------------- - final_list <- suppressWarnings( - tryCatch(compare_df(old_data, new_data), error = function(e) NULL) - ) +# Ensure output directory exists +if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) +} - # Gracefully handle if final_list is NULL - if (is.null(final_list)) { - warning("Comparison failed. Returning empty object.") - return(list()) - } +# Prepare object_list with error handling for missing indices +object_list <- list( + numeric = tryCatch(suppressWarnings(final_list[[1]] %>% select(-(matches("q|median")))), error = function(e) NULL), + factor = tryCatch(suppressWarnings(final_list[[2]]), error = function(e) NULL), + char = tryCatch(suppressWarnings(final_list[[3]]), error = function(e) NULL), + bin = tryCatch(suppressWarnings(final_list[[4]]), error = function(e) NULL), + date = tryCatch(suppressWarnings(final_list[[5]]), error = function(e) NULL), + surg = tryCatch(suppressWarnings(final_list[[6]]), error = function(e) NULL), + by_year_mean = tryCatch(suppressWarnings(final_list[[7]]), error = function(e) NULL), + by_year_na = tryCatch(suppressWarnings(final_list[[8]]), error = function(e) NULL) +) - # Extracting dates from file paths - extract_date <- function(path) { - suppressWarnings( - sub(".*secure_data/([0-9]{4}-[0-9]{2}-[0-9]{2}).*", "\\1", path) +purrr::iwalk(object_list, function(data, name) { + if (is.null(data)) { + message(paste("Skipping:", name, "due to missing data.")) + } else { + tryCatch( + { + output_file <- file.path(output_dir, paste0("results_", name, "_", current_date, ".csv")) + suppressWarnings(utils::write.csv(data, file = output_file, row.names = FALSE)) + message(paste("Saved:", output_file)) + }, + error = function(e) { + warning(paste("Skipping results for", name, "due to error:", e$message)) + } ) } +}) - date1 <- extract_date(df_old_path) # Old data date - date2 <- extract_date(df_new_path) # New data date - - # Ensure output directory exists - if (!dir.exists(output_dir)) { - dir.create(output_dir, recursive = TRUE) - } - - # Prepare object_list with error handling for missing indices - object_list <- list( - numeric = tryCatch(suppressWarnings(final_list[[1]] %>% select(-(matches("q|median")))), error = function(e) NULL), - factor = tryCatch(suppressWarnings(final_list[[2]]), error = function(e) NULL), - char = tryCatch(suppressWarnings(final_list[[3]]), error = function(e) NULL), - bin = tryCatch(suppressWarnings(final_list[[4]]), error = function(e) NULL), - date = tryCatch(suppressWarnings(final_list[[5]]), error = function(e) NULL), - surg = tryCatch(suppressWarnings(final_list[[6]]), error = function(e) NULL), - by_year_mean = tryCatch(suppressWarnings(final_list[[7]]), error = function(e) NULL), - by_year_na = tryCatch(suppressWarnings(final_list[[8]]), error = function(e) NULL) - ) - +# Save views if requested +if (save_views) { purrr::iwalk(object_list, function(data, name) { - if (is.null(data)) { - message(paste("Skipping:", name, "due to missing data.")) - } else { - tryCatch( - { - output_file <- file.path(output_dir, paste0("results_", name, "_", date1, "_", date2, ".csv")) - suppressWarnings(utils::write.csv(data, file = output_file, row.names = FALSE)) - message(paste("Saved:", output_file)) - }, - error = function(e) { - warning(paste("Skipping results for", name, "due to error:", e$message)) - } - ) + if (!is.null(data)) { + view_file <- file.path(output_dir, paste0("view_", name, "_", current_date, ".csv")) + suppressWarnings(utils::write.csv(data, file = view_file, row.names = FALSE)) + message(paste("View saved:", view_file)) } }) +} - # Save views if requested - if (save_views) { - purrr::iwalk(object_list, function(data, name) { - if (!is.null(data)) { - view_file <- file.path(output_dir, paste0("view_", name, "_", date1, "_", date2, ".csv")) - suppressWarnings(utils::write.csv(data, file = view_file, row.names = FALSE)) - message(paste("View saved:", view_file)) - } - }) - } - - message("Processing complete. Files saved to:", output_dir, " - Date range: ", date1, " to ", date2) - - # Display views if requested - if (show_views) { - purrr::iwalk(object_list, function(data, name) { - if (!is.null(data)) { - suppressWarnings(utils::View(data, title = paste0("View - ", name))) - } - }) - } else { - message("No views displayed as show_views = FALSE") - } - - # Return only non-null dataframes - return(purrr::compact(object_list)) +# Display views if requested +if (show_views) { + purrr::iwalk(object_list, function(data, name) { + if (!is.null(data)) { + suppressWarnings(utils::View(data, title = paste0("View - ", name))) + } + }) } + +# Return only non-null dataframes +return(purrr::compact(object_list)) +} \ No newline at end of file diff --git a/tests/testthat/test-compare_clean_data.R b/tests/testthat/test-compare_clean_data.R index 8849056..0709802 100644 --- a/tests/testthat/test-compare_clean_data.R +++ b/tests/testthat/test-compare_clean_data.R @@ -1,3 +1,59 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) +library(testthat) +library(dplyr) + +test_that("compare_clean_data handles basic comparisons", { + dir.create("test_output", showWarnings = FALSE) + + old_data <- data.frame( + visit_date = as.Date(c("2023-01-01", "2023-01-02")), + value = c(1, 2) + ) + new_data <- old_data + new_data$value <- c(1.1, 2.1) + + result <- suppressWarnings(compare_clean_data( + old_data, new_data, "test_output", + c("visit_date", "value"), "visit_date" + )) + + expect_type(result, "list") + unlink("test_output", recursive = TRUE) }) + +test_that("compare_clean_data validates date column", { + dir.create("test_output", showWarnings = FALSE) + old_df <- data.frame(x = 1, y = 2) + new_df <- data.frame(x = 1, y = 2) + + expect_error( + compare_clean_data( + old_df, new_df, "test_output", + c("x", "y"), "missing_date" + ), + "Date column.*not found" + ) + + unlink("test_output", recursive = TRUE) +}) + +test_that("compare_clean_data handles date filtering", { + dir.create("test_output", showWarnings = FALSE) + + old_data <- data.frame( + visit_date = as.Date(c("2023-01-01", "2023-01-02")), + value = c(1, 2) + ) + new_data <- data.frame( + visit_date = as.Date(c("2023-01-01", "2023-01-02", "2023-01-03")), + value = c(1.1, 2.1, 3.1) + ) + + result <- suppressWarnings(compare_clean_data( + old_data, new_data, "test_output", + c("visit_date", "value"), "visit_date", + limit_to_same_date = TRUE + )) + + expect_type(result, "list") + unlink("test_output", recursive = TRUE) +}) \ No newline at end of file