Skip to content

Commit

Permalink
Update compare_columns function and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
shaunporwal committed Jan 21, 2025
1 parent e1894c2 commit aaec8c0
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 100 deletions.
6 changes: 5 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@ RoxygenNote: 7.3.2
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
testthat (>= 3.0.0),
tibble
Config/testthat/edition: 3
Imports:
dplyr,
gt,
janitor,
lubridate,
magrittr,
purrr,
rlang,
stats,
stringr,
tidyr
Expand Down
11 changes: 11 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(check_duplicates)
export(compare_clean_data)
export(compare_columns)
export(compare_df)
export(from_win_to_mac)
export(get_sql)
Expand All @@ -10,4 +11,14 @@ export(parse_function)
export(read_raw_data)
export(to_sql_query)
import(dplyr)
importFrom(gt,cell_borders)
importFrom(gt,cells_body)
importFrom(gt,cols_label)
importFrom(gt,fmt_number)
importFrom(gt,gt)
importFrom(gt,px)
importFrom(gt,tab_header)
importFrom(gt,tab_style)
importFrom(magrittr,"%>%")
importFrom(rlang,.data)
importFrom(stats,na.omit)
153 changes: 127 additions & 26 deletions R/compare_columns.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,17 @@
#' Compare Column Names Between Two Data Frames
#'
#' This function identifies the differences in column names between two data frames,
#' showing which columns are unique to each data frame.
#' Compare and Analyze Two Data Frames
#'
#' @param df1 A data frame or tibble
#' @param df2 A data frame or tibble
#' @param df1_name Character string naming the first data frame (default: "df1")
#' @param df2_name Character string naming the second data frame (default: "df2")
#' @param group_by_col Character string specifying column name for grouping (default: NULL). Currently accepted but not used by the function body.
#'
#' @return A list with two character vectors:
#' \itemize{
#' \item `unique_to_df1`: Column names present only in df1
#' \item `unique_to_df2`: Column names present only in df2
#' }
#'
#' @examples
#' df1 <- data.frame(a = 1, b = 2, c = 3)
#' df2 <- data.frame(b = 2, c = 3, d = 4)
#' compare_columns(df1, df2, "first_df", "second_df")
#'
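#' @return Invisibly, a list with three elements: `filtered_dfs` (the columns unique to each data frame), `summary_data` (unique/total/mutual column names and counts, row counts, unique-row counts) and `tables` (the `summary_table` and `cols_table` gt objects). Both tables are also printed as a side effect.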
#' @importFrom gt gt tab_header cols_label fmt_number tab_style cell_borders px cells_body
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @export
compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2") {
compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2", group_by_col = NULL) {
# Input validation
if (!is.data.frame(df1) || !is.data.frame(df2)) {
stop("Both inputs must be data frames or tibbles")
Expand All @@ -30,21 +21,131 @@ compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2") {
stop("Data frame names must be character strings")
}

# Find unique columns in each dataframe
# Calculate comparisons
cols_unique_to_df1 <- setdiff(names(df1), names(df2))
cols_unique_to_df2 <- setdiff(names(df2), names(df1))
mutual_cols <- intersect(names(df1), names(df2))

# Create filtered dataframes
df1_filtered <- df1[, cols_unique_to_df1, drop = FALSE]
df2_filtered <- df2[, cols_unique_to_df2, drop = FALSE]

# Create summary dataframe for gt
summary_df <- data.frame(
Metric = c("Total Columns", "Unique Columns", "Total Rows", "Unique Rows", "Mutual Columns"),
df1_value = c(
ncol(df1),
length(cols_unique_to_df1),
nrow(df1),
nrow(unique(df1)),
length(mutual_cols)
),
df2_value = c(
ncol(df2),
length(cols_unique_to_df2),
nrow(df2),
nrow(unique(df2)),
NA # NA for mutual columns in df2
)
)

# Create return list
result <- list(
unique_to_df1 = cols_unique_to_df1,
unique_to_df2 = cols_unique_to_df2
# Create columns dataframe for gt
cols_df <- data.frame(
Category = c(
paste("Unique to", df1_name),
paste("Unique to", df2_name),
"Mutual Columns"
),
Columns = c(
paste(cols_unique_to_df1, collapse = ", "),
paste(cols_unique_to_df2, collapse = ", "),
paste(mutual_cols, collapse = ", ")
)
)

# Add names attribute for clarity
names(result) <- c(
paste0("unique_to_", df1_name),
paste0("unique_to_", df2_name)
# Build summary table (printed further below)
summary_table <- summary_df %>%
  gt() %>%
  tab_header(
    title = "DataFrame Comparison Summary"
  ) %>%
  # Relabel the two value columns with the caller-supplied data frame names.
  cols_label(
    Metric = "Metric",
    df1_value = df1_name,
    df2_value = df2_name
  ) %>%
  fmt_number(
    # Select columns by name: using the `.data` pronoun inside a tidyselect
    # context is deprecated (tidyselect >= 1.2.0) and raises a warning.
    columns = c("df1_value", "df2_value"),
    decimals = 0
  ) %>%
  # Thin horizontal rules between body rows.
  tab_style(
    style = cell_borders(
      sides = c("top", "bottom"),
      color = "gray85",
      weight = px(1)
    ),
    locations = cells_body()
  )

# Build columns table (printed further below)
cols_table <- cols_df %>%
gt() %>%
tab_header(
title = "Column Details"
) %>%
tab_style(
style = cell_borders(
sides = c("top", "bottom"),
color = "gray85",
weight = px(1)
),
locations = cells_body()
)

# Print tables
print(summary_table)
print(cols_table)

# Store results
summary_data <- list(
unique_cols = list(
df1 = cols_unique_to_df1,
df2 = cols_unique_to_df2
),
total_cols = list(
df1 = ncol(df1),
df2 = ncol(df2)
),
mutual_cols = mutual_cols,
row_counts = list(
df1 = nrow(df1),
df2 = nrow(df2)
),
unique_rows = list(
df1 = nrow(unique(df1)),
df2 = nrow(unique(df2))
)
)

return(result)
# Return list with all data
results <- list(
filtered_dfs = list(
df1 = df1_filtered,
df2 = df2_filtered
),
summary_data = summary_data,
tables = list(
summary_table = summary_table,
cols_table = cols_table
)
)

# Add names for easier access
names(results$filtered_dfs) <- c(df1_name, df2_name)
names(results$summary_data$unique_cols) <- c(df1_name, df2_name)
names(results$summary_data$total_cols) <- c(df1_name, df2_name)
names(results$summary_data$row_counts) <- c(df1_name, df2_name)
names(results$summary_data$unique_rows) <- c(df1_name, df2_name)

return(invisible(results))
}
31 changes: 31 additions & 0 deletions man/compare_columns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

103 changes: 30 additions & 73 deletions tests/testthat/test-compare_columns.R
Original file line number Diff line number Diff line change
@@ -1,86 +1,43 @@
# tests/testthat/test-compare_columns.R

test_that("compare_columns identifies differences correctly", {
# Basic test with different columns
df1 <- data.frame(a = 1, b = 2, c = 3)
df2 <- data.frame(b = 2, c = 3, d = 4)
result <- compare_columns(df1, df2)

expect_equal(result$unique_to_df1, "a")
expect_equal(result$unique_to_df2, "d")

# Test with identical dataframes
df3 <- data.frame(x = 1, y = 2)
df4 <- data.frame(x = 3, y = 4)
result2 <- compare_columns(df3, df4)

expect_equal(length(result2$unique_to_df1), 0)
expect_equal(length(result2$unique_to_df2), 0)

# Test with completely different columns
df5 <- data.frame(a = 1, b = 2)
df6 <- data.frame(c = 3, d = 4)
result3 <- compare_columns(df5, df6)

expect_equal(result3$unique_to_df1, c("a", "b"))
expect_equal(result3$unique_to_df2, c("c", "d"))
test_that("compare_columns handles basic comparison correctly", {
df1 <- data.frame(a = 1:3, b = 2:4, c = 3:5)
df2 <- data.frame(b = 2:4, c = 3:5, d = 4:6)

result <- compare_columns(df1, df2, "DF1", "DF2")

# Test core functionality
expect_equal(result$summary_data$unique_cols$DF1, "a")
expect_equal(result$summary_data$unique_cols$DF2, "d")
expect_equal(result$summary_data$mutual_cols, c("b", "c"))
expect_equal(result$summary_data$total_cols$DF1, 3)
expect_equal(result$summary_data$total_cols$DF2, 3)
})

test_that("compare_columns handles custom names correctly", {
df1 <- data.frame(a = 1, b = 2)
df2 <- data.frame(b = 2, c = 3)
test_that("compare_columns validates input correctly", {
df1 <- data.frame(a = 1:3)
not_df <- list(a = 1:3)

result <- compare_columns(df1, df2, "first", "second")

# Test custom naming
expect_equal(names(result), c("unique_to_first", "unique_to_second"))
expect_equal(result$unique_to_first, "a")
expect_equal(result$unique_to_second, "c")
# Test input validation
expect_error(compare_columns(df1, not_df), "Both inputs must be data frames")
expect_error(compare_columns(df1, df1, 1, "df2"), "Data frame names must be character")
})

test_that("compare_columns handles empty dataframes", {
df1 <- data.frame()
df2 <- data.frame(a = 1)

result1 <- compare_columns(df1, df2)
expect_equal(length(result1$unique_to_df1), 0)
expect_equal(result1$unique_to_df2, "a")
df1 <- data.frame(a = numeric(0))
df2 <- data.frame(b = numeric(0))

result2 <- compare_columns(df2, df1)
expect_equal(result2$unique_to_df1, "a")
expect_equal(length(result2$unique_to_df2), 0)
})

test_that("compare_columns validates inputs correctly", {
df1 <- data.frame(a = 1)
not_df <- list(a = 1)

# Test invalid dataframe inputs
expect_error(compare_columns(not_df, df1), "Both inputs must be data frames or tibbles")
expect_error(compare_columns(df1, not_df), "Both inputs must be data frames or tibbles")

# Test invalid name inputs
expect_error(compare_columns(df1, df1, 1, "second"), "Data frame names must be character strings")
expect_error(compare_columns(df1, df1, "first", TRUE), "Data frame names must be character strings")
})

test_that("compare_columns works with tibbles", {
skip_if_not_installed("tibble")
library(tibble)

tbl1 <- tibble(a = 1, b = 2)
tbl2 <- tibble(b = 2, c = 3)
result <- compare_columns(df1, df2)

result <- compare_columns(tbl1, tbl2)
expect_equal(result$unique_to_df1, "a")
expect_equal(result$unique_to_df2, "c")
expect_equal(result$summary_data$row_counts$df1, 0)
expect_equal(result$summary_data$row_counts$df2, 0)
expect_equal(result$summary_data$unique_cols$df1, "a")
})

test_that("compare_columns preserves column order", {
df1 <- data.frame(c = 1, a = 2, b = 3)
df2 <- data.frame(d = 1, b = 2, e = 3)
test_that("compare_columns identifies duplicate rows correctly", {
df1 <- data.frame(a = c(1,1,2), b = c(2,2,3))
df2 <- data.frame(b = c(2,2,3), c = c(3,3,4))

result <- compare_columns(df1, df2)
expect_equal(result$unique_to_df1, c("c", "a"))
expect_equal(result$unique_to_df2, c("d", "e"))

expect_equal(result$summary_data$unique_rows$df1, 2)
expect_equal(result$summary_data$unique_rows$df2, 2)
})

0 comments on commit aaec8c0

Please sign in to comment.