From aaec8c0e4a3027814686c2cc2e0bf5de86dc54cd Mon Sep 17 00:00:00 2001 From: PorwalS Date: Tue, 21 Jan 2025 14:51:42 -0500 Subject: [PATCH] Update compare_columns function and tests --- DESCRIPTION | 6 +- NAMESPACE | 11 ++ R/compare_columns.R | 153 +++++++++++++++++++++----- man/compare_columns.Rd | 31 ++++++ tests/testthat/test-compare_columns.R | 103 +++++------------ 5 files changed, 204 insertions(+), 100 deletions(-) create mode 100644 man/compare_columns.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 7a2ec48..10f5343 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,13 +14,17 @@ RoxygenNote: 7.3.2 Suggests: knitr, rmarkdown, - testthat (>= 3.0.0) + testthat (>= 3.0.0), + tibble Config/testthat/edition: 3 Imports: dplyr, + gt, janitor, lubridate, + magrittr, purrr, + rlang, stats, stringr, tidyr diff --git a/NAMESPACE b/NAMESPACE index ca4827f..aa83dd7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(check_duplicates) export(compare_clean_data) +export(compare_columns) export(compare_df) export(from_win_to_mac) export(get_sql) @@ -10,4 +11,14 @@ export(parse_function) export(read_raw_data) export(to_sql_query) import(dplyr) +importFrom(gt,cell_borders) +importFrom(gt,cells_body) +importFrom(gt,cols_label) +importFrom(gt,fmt_number) +importFrom(gt,gt) +importFrom(gt,px) +importFrom(gt,tab_header) +importFrom(gt,tab_style) +importFrom(magrittr,"%>%") +importFrom(rlang,.data) importFrom(stats,na.omit) diff --git a/R/compare_columns.R b/R/compare_columns.R index 5eb5a89..0c60b63 100644 --- a/R/compare_columns.R +++ b/R/compare_columns.R @@ -1,26 +1,17 @@ -#' Compare Column Names Between Two Data Frames -#' -#' This function identifies the differences in column names between two data frames, -#' showing which columns are unique to each data frame. +#' Compare and Analyze Two Data Frames #' #' @param df1 A data frame or tibble #' @param df2 A data frame or tibble #' @param df1_name Character string naming the first data frame (default: "df1") #' @param df2_name Character string naming the second data frame (default: "df2") +#' @param group_by_col Character string specifying column name for grouping (default: NULL) #' -#' @return A list with two character vectors: -#' \itemize{ -#' \item `unique_to_df1`: Column names present only in df1 -#' \item `unique_to_df2`: Column names present only in df2 -#' } -#' -#' @examples -#' df1 <- data.frame(a = 1, b = 2, c = 3) -#' df2 <- data.frame(b = 2, c = 3, d = 4) -#' compare_columns(df1, df2, "first_df", "second_df") -#' +#' @return A list containing comparison results and filtered dataframes +#' @importFrom gt gt tab_header cols_label fmt_number tab_style cell_borders px cells_body +#' @importFrom magrittr %>% +#' @importFrom rlang .data #' @export -compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2") { +compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2", group_by_col = NULL) { # Input validation if (!is.data.frame(df1) || !is.data.frame(df2)) { stop("Both inputs must be data frames or tibbles") @@ -30,21 +21,131 @@ compare_columns <- function(df1, df2, df1_name = "df1", df2_name = "df2") { stop("Data frame names must be character strings") } - # Find unique columns in each dataframe + # Calculate comparisons cols_unique_to_df1 <- setdiff(names(df1), names(df2)) cols_unique_to_df2 <- setdiff(names(df2), names(df1)) + mutual_cols <- intersect(names(df1), names(df2)) + + # Create filtered dataframes + df1_filtered <- df1[, cols_unique_to_df1, drop = FALSE] + df2_filtered <- df2[, cols_unique_to_df2, drop = FALSE] + + # Create summary dataframe for gt + summary_df <- data.frame( + Metric = c("Total Columns", "Unique Columns", "Total Rows", "Unique Rows", "Mutual Columns"), + df1_value = c( + ncol(df1), + length(cols_unique_to_df1), + nrow(df1), + nrow(unique(df1)), + length(mutual_cols) + ), + df2_value = c( + ncol(df2), + length(cols_unique_to_df2), + nrow(df2), + nrow(unique(df2)), + NA # NA for mutual columns in df2 + ) + ) - # Create return list - result <- list( - unique_to_df1 = cols_unique_to_df1, - unique_to_df2 = cols_unique_to_df2 + # Create columns dataframe for gt + cols_df <- data.frame( + Category = c( + paste("Unique to", df1_name), + paste("Unique to", df2_name), + "Mutual Columns" + ), + Columns = c( + paste(cols_unique_to_df1, collapse = ", "), + paste(cols_unique_to_df2, collapse = ", "), + paste(mutual_cols, collapse = ", ") + ) ) - # Add names attribute for clarity - names(result) <- c( - paste0("unique_to_", df1_name), - paste0("unique_to_", df2_name) + # Create and print summary table + summary_table <- summary_df %>% + gt() %>% + tab_header( + title = "DataFrame Comparison Summary" + ) %>% + cols_label( + Metric = "Metric", + df1_value = df1_name, + df2_value = df2_name + ) %>% + fmt_number( + columns = c(.data$df1_value, .data$df2_value), + decimals = 0 + ) %>% + tab_style( + style = cell_borders( + sides = c("top", "bottom"), + color = "gray85", + weight = px(1) + ), + locations = cells_body() + ) + + # Create and print columns table + cols_table <- cols_df %>% + gt() %>% + tab_header( + title = "Column Details" + ) %>% + tab_style( + style = cell_borders( + sides = c("top", "bottom"), + color = "gray85", + weight = px(1) + ), + locations = cells_body() + ) + + # Print tables + print(summary_table) + print(cols_table) + + # Store results + summary_data <- list( + unique_cols = list( + df1 = cols_unique_to_df1, + df2 = cols_unique_to_df2 + ), + total_cols = list( + df1 = ncol(df1), + df2 = ncol(df2) + ), + mutual_cols = mutual_cols, + row_counts = list( + df1 = nrow(df1), + df2 = nrow(df2) + ), + unique_rows = list( + df1 = nrow(unique(df1)), + df2 = nrow(unique(df2)) + ) ) - return(result) + # Return list with all data + results <- list( + filtered_dfs = list( + df1 = df1_filtered, + df2 = df2_filtered + ), + summary_data = summary_data, + tables = list( + summary_table = summary_table, + cols_table = cols_table + ) + ) + + # Add names for easier access + names(results$filtered_dfs) <- c(df1_name, df2_name) + names(results$summary_data$unique_cols) <- c(df1_name, df2_name) + names(results$summary_data$total_cols) <- c(df1_name, df2_name) + names(results$summary_data$row_counts) <- c(df1_name, df2_name) + names(results$summary_data$unique_rows) <- c(df1_name, df2_name) + + return(invisible(results)) } \ No newline at end of file diff --git a/man/compare_columns.Rd b/man/compare_columns.Rd new file mode 100644 index 0000000..e371bc9 --- /dev/null +++ b/man/compare_columns.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_columns.R +\name{compare_columns} +\alias{compare_columns} +\title{Compare and Analyze Two Data Frames} +\usage{ +compare_columns( + df1, + df2, + df1_name = "df1", + df2_name = "df2", + group_by_col = NULL +) +} +\arguments{ +\item{df1}{A data frame or tibble} + +\item{df2}{A data frame or tibble} + +\item{df1_name}{Character string naming the first data frame (default: "df1")} + +\item{df2_name}{Character string naming the second data frame (default: "df2")} + +\item{group_by_col}{Character string specifying column name for grouping (default: NULL)} +} +\value{ +A list containing comparison results and filtered dataframes +} +\description{ +Compare and Analyze Two Data Frames +} diff --git a/tests/testthat/test-compare_columns.R b/tests/testthat/test-compare_columns.R index da609cf..663c002 100644 --- a/tests/testthat/test-compare_columns.R +++ b/tests/testthat/test-compare_columns.R @@ -1,86 +1,43 @@ -# tests/testthat/test-compare_columns.R - -test_that("compare_columns identifies differences correctly", { - # Basic test with different columns - df1 <- data.frame(a = 1, b = 2, c = 3) - df2 <- data.frame(b = 2, c = 3, d = 4) - result <- compare_columns(df1, df2) - - expect_equal(result$unique_to_df1, "a") - expect_equal(result$unique_to_df2, "d") - - # Test with identical dataframes - df3 <- data.frame(x = 1, y = 2) - df4 <- data.frame(x = 3, y = 4) - result2 <- compare_columns(df3, df4) - - expect_equal(length(result2$unique_to_df1), 0) - expect_equal(length(result2$unique_to_df2), 0) - - # Test with completely different columns - df5 <- data.frame(a = 1, b = 2) - df6 <- data.frame(c = 3, d = 4) - result3 <- compare_columns(df5, df6) - - expect_equal(result3$unique_to_df1, c("a", "b")) - expect_equal(result3$unique_to_df2, c("c", "d")) +test_that("compare_columns handles basic comparison correctly", { + df1 <- data.frame(a = 1:3, b = 2:4, c = 3:5) + df2 <- data.frame(b = 2:4, c = 3:5, d = 4:6) + + result <- compare_columns(df1, df2, "DF1", "DF2") + + # Test core functionality + expect_equal(result$summary_data$unique_cols$DF1, "a") + expect_equal(result$summary_data$unique_cols$DF2, "d") + expect_equal(result$summary_data$mutual_cols, c("b", "c")) + expect_equal(result$summary_data$total_cols$DF1, 3) + expect_equal(result$summary_data$total_cols$DF2, 3) }) -test_that("compare_columns handles custom names correctly", { - df1 <- data.frame(a = 1, b = 2) - df2 <- data.frame(b = 2, c = 3) +test_that("compare_columns validates input correctly", { + df1 <- data.frame(a = 1:3) + not_df <- list(a = 1:3) - result <- compare_columns(df1, df2, "first", "second") - - # Test custom naming - expect_equal(names(result), c("unique_to_first", "unique_to_second")) - expect_equal(result$unique_to_first, "a") - expect_equal(result$unique_to_second, "c") + # Test input validation + expect_error(compare_columns(df1, not_df), "Both inputs must be data frames") + expect_error(compare_columns(df1, df1, 1, "df2"), "Data frame names must be character") }) test_that("compare_columns handles empty dataframes", { - df1 <- data.frame() - df2 <- data.frame(a = 1) - - result1 <- compare_columns(df1, df2) - expect_equal(length(result1$unique_to_df1), 0) - expect_equal(result1$unique_to_df2, "a") + df1 <- data.frame(a = numeric(0)) + df2 <- data.frame(b = numeric(0)) - result2 <- compare_columns(df2, df1) - expect_equal(result2$unique_to_df1, "a") - expect_equal(length(result2$unique_to_df2), 0) -}) - -test_that("compare_columns validates inputs correctly", { - df1 <- data.frame(a = 1) - not_df <- list(a = 1) - - # Test invalid dataframe inputs - expect_error(compare_columns(not_df, df1), "Both inputs must be data frames or tibbles") - expect_error(compare_columns(df1, not_df), "Both inputs must be data frames or tibbles") - - # Test invalid name inputs - expect_error(compare_columns(df1, df1, 1, "second"), "Data frame names must be character strings") - expect_error(compare_columns(df1, df1, "first", TRUE), "Data frame names must be character strings") -}) - -test_that("compare_columns works with tibbles", { - skip_if_not_installed("tibble") - library(tibble) - - tbl1 <- tibble(a = 1, b = 2) - tbl2 <- tibble(b = 2, c = 3) + result <- compare_columns(df1, df2) - result <- compare_columns(tbl1, tbl2) - expect_equal(result$unique_to_df1, "a") - expect_equal(result$unique_to_df2, "c") + expect_equal(result$summary_data$row_counts$df1, 0) + expect_equal(result$summary_data$row_counts$df2, 0) + expect_equal(result$summary_data$unique_cols$df1, "a") }) -test_that("compare_columns preserves column order", { - df1 <- data.frame(c = 1, a = 2, b = 3) - df2 <- data.frame(d = 1, b = 2, e = 3) +test_that("compare_columns identifies duplicate rows correctly", { + df1 <- data.frame(a = c(1,1,2), b = c(2,2,3)) + df2 <- data.frame(b = c(2,2,3), c = c(3,3,4)) result <- compare_columns(df1, df2) - expect_equal(result$unique_to_df1, c("c", "a")) - expect_equal(result$unique_to_df2, c("d", "e")) + + expect_equal(result$summary_data$unique_rows$df1, 2) + expect_equal(result$summary_data$unique_rows$df2, 2) }) \ No newline at end of file