Skip to content

Commit

Permalink
Update compare_clean_data, add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
shaunporwal committed Jan 22, 2025
1 parent 024e88f commit 780dd35
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 109 deletions.
196 changes: 89 additions & 107 deletions R/compare_clean_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
#' and generates outputs for numeric, factor, character, binary, date, and other data types.
#' The results are saved to an output directory, and optional views of the data can be displayed or saved.
#'
#' @param df_old_path Character. Path to the old dataset file (e.g., a CSV file).
#' @param df_new_path Character. Path to the new dataset file (e.g., a CSV file).
#' @param old_data Dataframe. The old dataset for comparison.
#' @param new_data Dataframe. The new dataset for comparison.
#' @param output_dir Character. Path to the directory where the output files will be saved.
#' @param final_vars_set Character vector. List of variable names to include in the comparison.
#' @param date_col Character. Name of the column in the datasets representing the date.
Expand All @@ -17,128 +17,110 @@
#' @return A list of dataframes containing the comparison results. Dataframes include numeric, factor, character, binary, date,
#' and other derived datasets. Entries with missing data are omitted.
#'
#' The function also saves CSV files of the comparison results to the specified output directory.
#' If `save_views` is TRUE, the views of the dataframes are saved as additional files in the output directory.
#'
#' @export
compare_clean_data <- function(
df_old_path,
df_new_path,
output_dir,
final_vars_set,
date_col,
limit_to_same_date = TRUE,
show_views = FALSE,
save_views = FALSE
old_data,
new_data,
output_dir,
final_vars_set,
date_col,
limit_to_same_date = TRUE,
show_views = FALSE,
save_views = FALSE
) {
# Read and select specific columns
old_data <- suppressWarnings(
read_raw_data(file = df_old_path) |> select(all_of(final_vars_set))
# Select specific columns
old_data <- old_data |> select(all_of(final_vars_set))
new_data <- new_data |> select(all_of(final_vars_set))

# Ensure the date column exists in the datasets
if (!date_col %in% names(old_data) | !date_col %in% names(new_data)) {
stop(paste("Date column", date_col, "not found in one or both datasets."))
}

# Code block to limit compare to similar years
max_filter_date <- max(old_data[[date_col]], na.rm = TRUE)
min_filter_date <- min(old_data[[date_col]], na.rm = TRUE)

if (limit_to_same_date) {
new_data <- suppressWarnings(
new_data |>
filter(.data[[date_col]] <= max_filter_date & .data[[date_col]] >= min_filter_date)
)
} else {
new_data <- suppressWarnings(
read_raw_data(file = df_new_path) |> select(all_of(final_vars_set))
new_data |>
filter(.data[[date_col]] >= min_filter_date)
)
}

# Ensure the date column exists in the datasets
if (!date_col %in% names(old_data) | !date_col %in% names(new_data)) {
stop(paste("Date column", date_col, "not found in one or both datasets."))
}
# Run Compare -------------------------------------------------------------
final_list <- suppressWarnings(
tryCatch(compare_df(old_data, new_data), error = function(e) NULL)
)

# Code block to limit compare to similar years
max_filter_date <- max(old_data[[date_col]], na.rm = TRUE)
min_filter_date <- min(old_data[[date_col]], na.rm = TRUE)
# Gracefully handle if final_list is NULL
if (is.null(final_list)) {
warning("Comparison failed. Returning empty object.")
return(list())
}

if (limit_to_same_date) {
new_data <- suppressWarnings(
new_data |>
filter(.data[[date_col]] <= max_filter_date & .data[[date_col]] >= min_filter_date)
)
} else {
new_data <- suppressWarnings(
new_data |>
filter(.data[[date_col]] >= min_filter_date)
)
}
# Use current date for file naming
current_date <- format(Sys.Date(), "%Y-%m-%d")

# Run Compare -------------------------------------------------------------
final_list <- suppressWarnings(
tryCatch(compare_df(old_data, new_data), error = function(e) NULL)
)
# Ensure output directory exists
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}

# Gracefully handle if final_list is NULL
if (is.null(final_list)) {
warning("Comparison failed. Returning empty object.")
return(list())
}
# Prepare object_list with error handling for missing indices
object_list <- list(
numeric = tryCatch(suppressWarnings(final_list[[1]] %>% select(-(matches("q|median")))), error = function(e) NULL),
factor = tryCatch(suppressWarnings(final_list[[2]]), error = function(e) NULL),
char = tryCatch(suppressWarnings(final_list[[3]]), error = function(e) NULL),
bin = tryCatch(suppressWarnings(final_list[[4]]), error = function(e) NULL),
date = tryCatch(suppressWarnings(final_list[[5]]), error = function(e) NULL),
surg = tryCatch(suppressWarnings(final_list[[6]]), error = function(e) NULL),
by_year_mean = tryCatch(suppressWarnings(final_list[[7]]), error = function(e) NULL),
by_year_na = tryCatch(suppressWarnings(final_list[[8]]), error = function(e) NULL)
)

# Extracting dates from file paths
extract_date <- function(path) {
suppressWarnings(
sub(".*secure_data/([0-9]{4}-[0-9]{2}-[0-9]{2}).*", "\\1", path)
purrr::iwalk(object_list, function(data, name) {
if (is.null(data)) {
message(paste("Skipping:", name, "due to missing data."))
} else {
tryCatch(
{
output_file <- file.path(output_dir, paste0("results_", name, "_", current_date, ".csv"))
suppressWarnings(utils::write.csv(data, file = output_file, row.names = FALSE))
message(paste("Saved:", output_file))
},
error = function(e) {
warning(paste("Skipping results for", name, "due to error:", e$message))
}
)
}
})

date1 <- extract_date(df_old_path) # Old data date
date2 <- extract_date(df_new_path) # New data date

# Ensure output directory exists
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}

# Prepare object_list with error handling for missing indices
object_list <- list(
numeric = tryCatch(suppressWarnings(final_list[[1]] %>% select(-(matches("q|median")))), error = function(e) NULL),
factor = tryCatch(suppressWarnings(final_list[[2]]), error = function(e) NULL),
char = tryCatch(suppressWarnings(final_list[[3]]), error = function(e) NULL),
bin = tryCatch(suppressWarnings(final_list[[4]]), error = function(e) NULL),
date = tryCatch(suppressWarnings(final_list[[5]]), error = function(e) NULL),
surg = tryCatch(suppressWarnings(final_list[[6]]), error = function(e) NULL),
by_year_mean = tryCatch(suppressWarnings(final_list[[7]]), error = function(e) NULL),
by_year_na = tryCatch(suppressWarnings(final_list[[8]]), error = function(e) NULL)
)

# Save views if requested
if (save_views) {
purrr::iwalk(object_list, function(data, name) {
if (is.null(data)) {
message(paste("Skipping:", name, "due to missing data."))
} else {
tryCatch(
{
output_file <- file.path(output_dir, paste0("results_", name, "_", date1, "_", date2, ".csv"))
suppressWarnings(utils::write.csv(data, file = output_file, row.names = FALSE))
message(paste("Saved:", output_file))
},
error = function(e) {
warning(paste("Skipping results for", name, "due to error:", e$message))
}
)
if (!is.null(data)) {
view_file <- file.path(output_dir, paste0("view_", name, "_", current_date, ".csv"))
suppressWarnings(utils::write.csv(data, file = view_file, row.names = FALSE))
message(paste("View saved:", view_file))
}
})
}

# Save views if requested
if (save_views) {
purrr::iwalk(object_list, function(data, name) {
if (!is.null(data)) {
view_file <- file.path(output_dir, paste0("view_", name, "_", date1, "_", date2, ".csv"))
suppressWarnings(utils::write.csv(data, file = view_file, row.names = FALSE))
message(paste("View saved:", view_file))
}
})
}

message("Processing complete. Files saved to:", output_dir, " - Date range: ", date1, " to ", date2)

# Display views if requested
if (show_views) {
purrr::iwalk(object_list, function(data, name) {
if (!is.null(data)) {
suppressWarnings(utils::View(data, title = paste0("View - ", name)))
}
})
} else {
message("No views displayed as show_views = FALSE")
}

# Return only non-null dataframes
return(purrr::compact(object_list))
# Display views if requested
if (show_views) {
purrr::iwalk(object_list, function(data, name) {
if (!is.null(data)) {
suppressWarnings(utils::View(data, title = paste0("View - ", name)))
}
})
}

# Return only non-null dataframes
return(purrr::compact(object_list))
}
60 changes: 58 additions & 2 deletions tests/testthat/test-compare_clean_data.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,59 @@
test_that("multiplication works", {
expect_equal(2 * 2, 4)
library(testthat)
library(dplyr)

test_that("compare_clean_data handles basic comparisons", {
dir.create("test_output", showWarnings = FALSE)

old_data <- data.frame(
visit_date = as.Date(c("2023-01-01", "2023-01-02")),
value = c(1, 2)
)
new_data <- old_data
new_data$value <- c(1.1, 2.1)

result <- suppressWarnings(compare_clean_data(
old_data, new_data, "test_output",
c("visit_date", "value"), "visit_date"
))

expect_type(result, "list")
unlink("test_output", recursive = TRUE)
})

test_that("compare_clean_data validates date column", {
dir.create("test_output", showWarnings = FALSE)
old_df <- data.frame(x = 1, y = 2)
new_df <- data.frame(x = 1, y = 2)

expect_error(
compare_clean_data(
old_df, new_df, "test_output",
c("x", "y"), "missing_date"
),
"Date column.*not found"
)

unlink("test_output", recursive = TRUE)
})

test_that("compare_clean_data handles date filtering", {
dir.create("test_output", showWarnings = FALSE)

old_data <- data.frame(
visit_date = as.Date(c("2023-01-01", "2023-01-02")),
value = c(1, 2)
)
new_data <- data.frame(
visit_date = as.Date(c("2023-01-01", "2023-01-02", "2023-01-03")),
value = c(1.1, 2.1, 3.1)
)

result <- suppressWarnings(compare_clean_data(
old_data, new_data, "test_output",
c("visit_date", "value"), "visit_date",
limit_to_same_date = TRUE
))

expect_type(result, "list")
unlink("test_output", recursive = TRUE)
})

0 comments on commit 780dd35

Please sign in to comment.