Skip to content

Commit

Permalink
Update compare_columns func, tests, fix issues
Browse files Browse the repository at this point in the history
  • Loading branch information
shaunporwal committed Jan 22, 2025
1 parent 1c228ac commit 90d9f5f
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 88 deletions.
164 changes: 98 additions & 66 deletions R/compare_columns.R
Original file line number Diff line number Diff line change
@@ -1,81 +1,113 @@
#' Compare and Analyze Two Data Frames
#' Compare Column Names Between Two Objects
#'
#' @param df1 A data frame or tibble
#' @param df2 A data frame or tibble
#' @param df1_name Character string naming the first data frame (default: "df1")
#' @param df2_name Character string naming the second data frame (default: "df2")
#' @description
#' Compares column names between two objects (data frames or vectors of column names)
#' and provides a detailed summary of unique and shared columns.
#'
#' @param obj1 A data frame, tibble, or vector of column names
#' @param obj2 A data frame, tibble, or vector of column names
#' @param obj1_name Character string naming the first object (default: "obj1")
#' @param obj2_name Character string naming the second object (default: "obj2")
#' @param group_by_col Character string specifying column name for grouping (default: NULL)
#'
#' @return A list containing comparison results and filtered dataframes
#' @return A list containing:
#' \itemize{
#' \item filtered_objects: List of objects containing unique columns
#' \item summary_data: List of comparison statistics
#' \item tables: GT tables showing comparison results
#' }
#' @importFrom gt gt tab_header cols_label fmt_number tab_style cell_borders px cells_body
#' @importFrom rlang .data
#' @importFrom rlang %||%
#' @export
compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2", group_by_col = NULL) {
# Input validation
if (!is.data.frame(df1) || !is.data.frame(df2)) {
stop("Both inputs must be data frames or tibbles")
#'
#' @examples
#' # Compare column names of two data frames
#' compare_columns(mtcars, iris)
#'
#' # Compare vectors of column names
#' compare_columns(names(mtcars), names(iris))
compare_columns <- function(obj1, obj2, obj1_name = "obj1", obj2_name = "obj2", group_by_col = NULL) {
# Validate inputs first
# Update validation at start of function
if (!is.data.frame(obj1) && (!is.vector(obj1) || is.list(obj1))) {
stop("obj1 must be a data frame, tibble, or vector of column names")
}
if (!is.data.frame(obj2) && (!is.vector(obj2) || is.list(obj2))) {
stop("obj2 must be a data frame, tibble, or vector of column names")
}

# Special handling for vectors of names
if (is.vector(obj1) && !is.data.frame(obj1)) {
obj1_names <- obj1
obj1 <- data.frame(matrix(ncol = length(obj1)))
names(obj1) <- obj1_names
}
if (is.vector(obj2) && !is.data.frame(obj2)) {
obj2_names <- obj2
obj2 <- data.frame(matrix(ncol = length(obj2)))
names(obj2) <- obj2_names
}
if (!is.character(df1_name) || !is.character(df2_name)) {
stop("Data frame names must be character strings")

if (!is.character(obj1_name) || !is.character(obj2_name)) {
stop("Object names must be character strings")
}

# Calculate comparisons
cols_unique_to_df1 <- setdiff(names(df1), names(df2))
cols_unique_to_df2 <- setdiff(names(df2), names(df1))
mutual_cols <- intersect(names(df1), names(df2))
# Create filtered dataframes
df1_filtered <- df1[, cols_unique_to_df1, drop = FALSE]
df2_filtered <- df2[, cols_unique_to_df2, drop = FALSE]
cols_unique_to_obj1 <- setdiff(names(obj1), names(obj2))
cols_unique_to_obj2 <- setdiff(names(obj2), names(obj1))
mutual_cols <- intersect(names(obj1), names(obj2))

# Create filtered objects
obj1_filtered <- obj1[, cols_unique_to_obj1, drop = FALSE]
obj2_filtered <- obj2[, cols_unique_to_obj2, drop = FALSE]

# Create summary dataframe for gt
summary_df <- data.frame(
Metric = c("Total Columns", "Unique Columns", "Total Rows", "Unique Rows", "Mutual Columns"),
df1_value = c(
ncol(df1),
length(cols_unique_to_df1),
nrow(df1),
nrow(unique(df1)),
obj1_value = c(
ncol(obj1),
length(cols_unique_to_obj1),
nrow(obj1),
nrow(unique(obj1)),
length(mutual_cols)
),
df2_value = c(
ncol(df2),
length(cols_unique_to_df2),
nrow(df2),
nrow(unique(df2)),
NA # NA for mutual columns in df2
obj2_value = c(
ncol(obj2),
length(cols_unique_to_obj2),
nrow(obj2),
nrow(unique(obj2)),
NA # NA for mutual columns in obj2
)
)

# Create columns dataframe for gt
cols_df <- data.frame(
Category = c(
paste("Unique to", df1_name),
paste("Unique to", df2_name),
paste("Unique to", obj1_name),
paste("Unique to", obj2_name),
"Mutual Columns"
),
Columns = c(
paste(cols_unique_to_df1, collapse = ", "),
paste(cols_unique_to_df2, collapse = ", "),
paste(cols_unique_to_obj1, collapse = ", "),
paste(cols_unique_to_obj2, collapse = ", "),
paste(mutual_cols, collapse = ", ")
)
)

# Create and print summary table
summary_table <- summary_df |>
gt() |>
tab_header(
title = "DataFrame Comparison Summary"
title = "Object Comparison Summary"
) |>
cols_label(
Metric = "Metric",
df1_value = df1_name,
df2_value = df2_name
obj1_value = obj1_name,
obj2_value = obj2_name
) |>
fmt_number(
columns = c(.data$df1_value, .data$df2_value),
columns = c(.data$obj1_value, .data$obj2_value),
decimals = 0
) |>
tab_style(
Expand All @@ -86,7 +118,7 @@ compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2", group_
),
locations = cells_body()
)

# Create and print columns table
cols_table <- cols_df |>
gt() |>
Expand All @@ -101,51 +133,51 @@ compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2", group_
),
locations = cells_body()
)

# Print tables
print(summary_table)
print(cols_table)

# Store results
summary_data <- list(
unique_cols = list(
df1 = cols_unique_to_df1,
df2 = cols_unique_to_df2
obj1 = cols_unique_to_obj1,
obj2 = cols_unique_to_obj2
),
total_cols = list(
df1 = ncol(df1),
df2 = ncol(df2)
obj1 = ncol(obj1),
obj2 = ncol(obj2)
),
mutual_cols = mutual_cols,
row_counts = list(
df1 = nrow(df1),
df2 = nrow(df2)
obj1 = nrow(obj1),
obj2 = nrow(obj2)
),
unique_rows = list(
df1 = nrow(unique(df1)),
df2 = nrow(unique(df2))
obj1 = nrow(unique(obj1)),
obj2 = nrow(unique(obj2))
)
)

# Return list with all data
results <- list(
filtered_dfs = list(
df1 = df1_filtered,
df2 = df2_filtered
filtered_objects = list(
obj1 = obj1_filtered,
obj2 = obj2_filtered
),
summary_data = summary_data,
tables = list(
summary_table = summary_table,
cols_table = cols_table
)
)

# Add names for easier access
names(results$filtered_dfs) <- c(df1_name, df2_name)
names(results$summary_data$unique_cols) <- c(df1_name, df2_name)
names(results$summary_data$total_cols) <- c(df1_name, df2_name)
names(results$summary_data$row_counts) <- c(df1_name, df2_name)
names(results$summary_data$unique_rows) <- c(df1_name, df2_name)
names(results$filtered_objects) <- c(obj1_name, obj2_name)
names(results$summary_data$unique_cols) <- c(obj1_name, obj2_name)
names(results$summary_data$total_cols) <- c(obj1_name, obj2_name)
names(results$summary_data$row_counts) <- c(obj1_name, obj2_name)
names(results$summary_data$unique_rows) <- c(obj1_name, obj2_name)

return(invisible(results))
}
}
35 changes: 24 additions & 11 deletions man/compare_columns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

46 changes: 35 additions & 11 deletions tests/testthat/test-compare_columns.R
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
test_that("compare_columns handles basic comparison correctly", {
test_that("compare_columns handles basic dataframe comparison correctly", {
df1 <- data.frame(a = 1:3, b = 2:4, c = 3:5)
df2 <- data.frame(b = 2:4, c = 3:5, d = 4:6)

result <- compare_columns(df1, df2, "DF1", "DF2")

# Test core functionality
expect_equal(result$summary_data$unique_cols$DF1, "a")
expect_equal(result$summary_data$unique_cols$DF2, "d")
expect_equal(result$summary_data$mutual_cols, c("b", "c"))
expect_equal(result$summary_data$total_cols$DF1, 3)
expect_equal(result$summary_data$total_cols$DF2, 3)
})

# Plain character vectors may be passed instead of data frames; they are
# compared as if they were the column names of two tables.
test_that("compare_columns handles vector inputs correctly", {
  left_names <- c("a", "b", "c")
  right_names <- c("b", "c", "d")

  comparison <- compare_columns(left_names, right_names, "VEC1", "VEC2")
  stats <- comparison$summary_data

  # Names found on one side only are reported under the supplied labels.
  expect_equal(stats$unique_cols$VEC1, "a")
  expect_equal(stats$unique_cols$VEC2, "d")

  # Names shared by both sides, then the total count for the first vector.
  expect_equal(stats$mutual_cols, c("b", "c"))
  expect_equal(stats$total_cols$VEC1, 3)
})

# Invalid arguments must be rejected with a descriptive error: a list or NULL
# in either object position, and a non-character object name.
test_that("compare_columns validates input correctly", {
df1 <- data.frame(a = 1:3)
not_df <- list(a = 1:3)
not_df <- list(a = 1:3) # A plain list: not a data frame, and rejected by the is.list() check even though is.vector() is TRUE for lists
null_input <- NULL

# Test input validation
# NOTE(review): the next two expectations use the pre-rename error messages
# ("Both inputs must be data frames", "Data frame names must be character"),
# while the validation shown in this file raises "obj1/obj2 must be a data
# frame, tibble, or vector of column names" and "Object names must be
# character strings". They look like removed diff lines - confirm.
expect_error(compare_columns(df1, not_df), "Both inputs must be data frames")
expect_error(compare_columns(df1, df1, 1, "df2"), "Data frame names must be character")
# A list in either position, or NULL, fails the object-type check.
expect_error(compare_columns(df1, not_df), "obj2 must be a data frame, tibble, or vector")
expect_error(compare_columns(not_df, df1), "obj1 must be a data frame, tibble, or vector")
expect_error(compare_columns(null_input, df1), "obj1 must be a data frame, tibble, or vector")
# A numeric object name fails the character-name check.
expect_error(compare_columns(df1, df1, 1, "obj2"), "Object names must be character strings")
})

test_that("compare_columns handles empty dataframes", {
Expand All @@ -27,9 +40,9 @@ test_that("compare_columns handles empty dataframes", {

result <- compare_columns(df1, df2)

expect_equal(result$summary_data$row_counts$df1, 0)
expect_equal(result$summary_data$row_counts$df2, 0)
expect_equal(result$summary_data$unique_cols$df1, "a")
expect_equal(result$summary_data$row_counts$obj1, 0)
expect_equal(result$summary_data$row_counts$obj2, 0)
expect_equal(result$summary_data$unique_cols$obj1, "a")
})

test_that("compare_columns identifies duplicate rows correctly", {
Expand All @@ -38,6 +51,17 @@ test_that("compare_columns identifies duplicate rows correctly", {

result <- compare_columns(df1, df2)

expect_equal(result$summary_data$unique_rows$df1, 2)
expect_equal(result$summary_data$unique_rows$df2, 2)
expect_equal(result$summary_data$unique_rows$obj1, 2)
expect_equal(result$summary_data$unique_rows$obj2, 2)
})

# A data frame can be compared against a bare vector of column names; results
# are keyed by the default labels "obj1" and "obj2".
test_that("compare_columns handles mixed input types", {
  frame_input <- data.frame(a = 1:3, b = 2:4)
  name_input <- c("b", "c")

  comparison <- compare_columns(frame_input, name_input)
  stats <- comparison$summary_data

  # "a" exists only in the data frame, "c" only in the name vector.
  expect_equal(stats$unique_cols$obj1, "a")
  expect_equal(stats$unique_cols$obj2, "c")

  # "b" is the single name common to both inputs.
  expect_equal(stats$mutual_cols, "b")
})

0 comments on commit 90d9f5f

Please sign in to comment.