diff --git a/R/join_closest.R b/R/join_closest.R new file mode 100644 index 0000000..7d19199 --- /dev/null +++ b/R/join_closest.R @@ -0,0 +1,155 @@ +#' Join Data Frames by Closest Time Match, with Optional Grouping +#' +#' @description +#' Joins two data frames by matching the closest timestamp in the right-hand data frame +#' to each timestamp in the left-hand data frame, optionally within groups. Can match to +#' either the closest preceding or following timestamp. +#' +#' @param x Left-hand data frame +#' @param y Right-hand data frame +#' @param x_time_col Name of the time column in x +#' @param y_time_col Name of the time column in y +#' @param direction Either '>=' (default) to match to closest preceding or equal value, +#' or '<=' to match to closest following or equal value +#' @param by Character vector of column names to group by. These columns must exist +#' in both data frames and will be joined exactly. +#' +#' @return A data frame containing all columns from x and all columns from y except y_time_col. +#' Rows in x with no matching timestamp in y will have NA values for y columns. +#' +#' @examples +#' x <- data.frame( +#' group = c(1,1,1, 2,2,2), +#' time = c(0,1,2, 0,1,2), +#' value1 = 1:6 +#' ) +#' y <- data.frame( +#' group = c(1,1, 2,2), +#' timestamp = c(0.5,1.5, 0.5,1.5), +#' value2 = letters[1:4] +#' ) +#' +#' # Match to closest preceding value within groups +#' join_closest(x, y, "time", "timestamp", by = "group", direction = ">=") +join_closest <- function(x, y, x_time_col, y_time_col, direction = ">=", by = NULL) { + # Input validation + if (!is.data.frame(x)) { + cli::cli_abort("'x' must be a data frame") + } + if (!is.data.frame(y)) { + cli::cli_abort("'y' must be a data frame") + } + + if (!x_time_col %in% names(x)) { + cli::cli_abort("Column {.val {x_time_col}} not found in 'x' data frame") + } + if (!y_time_col %in% names(y)) { + cli::cli_abort("Column {.val {y_time_col}} not found in 'y' data frame") + } + + if (!is.numeric(x[[x_time_col]])) { + cli::cli_abort("Column {.val {x_time_col}} in 'x' must be numeric") + } + if (!is.numeric(y[[y_time_col]])) { + cli::cli_abort("Column {.val {y_time_col}} in 'y' must be numeric") + } + + if (!direction %in% c("<=", ">=")) { + cli::cli_abort("'direction' must be either '<=' or '>='") + } + + # Validate grouping columns if provided + if (!is.null(by)) { + if (!all(by %in% names(x))) { + missing <- setdiff(by, names(x)) + cli::cli_abort("Grouping columns {.val {missing}} not found in 'x'") + } + if (!all(by %in% names(y))) { + missing <- setdiff(by, names(y)) + cli::cli_abort("Grouping columns {.val {missing}} not found in 'y'") + } + } + + # Check for empty data frames + if (nrow(x) == 0) { + cli::cli_warn("'x' has 0 rows") + return(x) + } + if (nrow(y) == 0) { + cli::cli_warn("'y' has 0 rows") + return(x) + } + + # If no grouping, use original function logic + if (is.null(by)) { + if (is.unsorted(y[[y_time_col]])) { + cli::cli_warn("Timestamps in 'y' are not sorted. Sorting automatically.") + y <- y[order(y[[y_time_col]]), , drop = FALSE] + } + + if (anyDuplicated(y[[y_time_col]])) { + cli::cli_warn("Duplicate timestamps found in 'y'. Using first occurrence.") + } + + return(closest_join_ungrouped(x, y, x_time_col, y_time_col, direction)) + } + + # Split data into groups and apply join within each group + result <- x + y_cols <- setdiff(names(y), c(y_time_col, by)) + result[y_cols] <- NA + + # Create group identifiers + x_groups <- do.call(paste, c(x[by], sep = "\001")) + y_groups <- do.call(paste, c(y[by], sep = "\001")) + + # For each unique group in x + for (group in unique(x_groups)) { + x_idx <- which(x_groups == group) + y_idx <- which(y_groups == group) + + if (length(y_idx) == 0) next + + # Get subset of data for this group + x_subset <- x[x_idx, , drop = FALSE] + y_subset <- y[y_idx, , drop = FALSE] + + # Sort y timestamps within group + y_subset <- y_subset[order(y_subset[[y_time_col]]), , drop = FALSE] + + # Perform join for this group + matched <- closest_join_ungrouped(x_subset, y_subset, x_time_col, y_time_col, direction) + + # Update result + result[x_idx, y_cols] <- matched[y_cols] + } + + return(result) +} + +# Internal function for ungrouped joining +closest_join_ungrouped <- function(x, y, x_time_col, y_time_col, direction) { + x_times <- x[[x_time_col]] + y_times <- y[[y_time_col]] + + if (direction == ">=") { + matches <- findInterval(x_times, y_times) + matches[matches == 0] <- NA + matches[matches == length(y_times) & x_times > max(y_times)] <- NA + } else { + matches <- findInterval(x_times, y_times, rightmost.closed = TRUE) + matches[matches < length(y_times)] <- matches[matches < length(y_times)] + 1 + matches[x_times <= min(y_times)] <- 1 + } + + # Create result data frame with NAs where needed + y_subset <- y[, setdiff(names(y), y_time_col), drop = FALSE] + result <- x + + # Add each column from y separately to handle NAs properly + for(col in names(y_subset)) { + result[[col]] <- y_subset[[col]][matches] + } + + return(result) +} diff --git a/man/join_closest.Rd b/man/join_closest.Rd new file mode 100644 index 0000000..2bf0d95 --- /dev/null +++ b/man/join_closest.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/join_closest.R +\name{join_closest} +\alias{join_closest} +\title{Join Data Frames by Closest Time Match, with Optional Grouping} +\usage{ +join_closest(x, y, x_time_col, y_time_col, direction = ">=", by = NULL) +} +\arguments{ +\item{x}{Left-hand data frame} + +\item{y}{Right-hand data frame} + +\item{x_time_col}{Name of the time column in x} + +\item{y_time_col}{Name of the time column in y} + +\item{direction}{Either '>=' (default) to match to closest preceding or equal value, +or '<=' to match to closest following or equal value} + +\item{by}{Character vector of column names to group by. These columns must exist +in both data frames and will be joined exactly.} +} +\value{ +A data frame containing all columns from x and all columns from y except y_time_col. +Rows in x with no matching timestamp in y will have NA values for y columns. +} +\description{ +Joins two data frames by matching the closest timestamp in the right-hand data frame +to each timestamp in the left-hand data frame, optionally within groups. Can match to +either the closest preceding or following timestamp. +} +\examples{ +x <- data.frame( + group = c(1,1,1, 2,2,2), + time = c(0,1,2, 0,1,2), + value1 = 1:6 +) +y <- data.frame( + group = c(1,1, 2,2), + timestamp = c(0.5,1.5, 0.5,1.5), + value2 = letters[1:4] +) + +# Match to closest preceding value within groups +join_closest(x, y, "time", "timestamp", by = "group", direction = ">=") +} diff --git a/tests/testthat/test-join_closest.R b/tests/testthat/test-join_closest.R new file mode 100644 index 0000000..25e0395 --- /dev/null +++ b/tests/testthat/test-join_closest.R @@ -0,0 +1,185 @@ +library(testthat) + +# # Example usage: +data1 <- data.frame( + time = seq(from = 0, to = 10, length.out = 10), + value1 = rnorm(10) +) + +data2 <- data.frame( + timestamp = seq(from = 0.05, to = 10, length.out = 8), + value2 = rnorm(8) +) +data1 +data2 + +jb <- dplyr::join_by(dplyr::closest(time > timestamp)) +result1 <- dplyr::left_join(data1, data2, by = jb) |> + dplyr::select(-time) + +result2 <- join_closest(data2, data1, "timestamp", "time", direction = ">=") +all(identical(result1, result2)) +result1 +result2 + +microbenchmark::microbenchmark( + dp = dplyr::left_join(data1, data2, by = jb), + us = join_closest(data1, data2, "time", "timestamp", direction = ">=") +) + + + +test_that("join_closest basic functionality works", { + x <- data.frame( + time = c(0, 1, 2, 3), + value1 = 1:4 + ) + y <- data.frame( + timestamp = c(0.5, 1.5, 2.5), + value2 = letters[1:3] + ) + + # Test >= direction + result_ge <- join_closest(x, y, "time", "timestamp", ">=") + expect_equal(nrow(result_ge), 4) + expect_equal(result_ge$value2, c(NA, "a", "a", "b")) + + # Test <= direction + result_le <- join_closest(x, y, "time", "timestamp", "<=") + expect_equal(nrow(result_le), 4) + expect_equal(result_le$value2, c("a", "b", "c", NA)) +}) + +test_that("join_closest handles edge cases", { + x <- data.frame( + time = c(-1, 0, 10, 11), + value1 = 1:4 + ) + y <- data.frame( + timestamp = c(0, 1, 2), + value2 = letters[1:3] + ) + + # Test values outside range + result <- join_closest(x, y, "time", "timestamp", ">=") + expect_equal(result$value2, c(NA, "a", "c", NA)) + + # Test empty data frames + expect_warning( + result <- join_closest(x[0,], y, "time", "timestamp"), + "'x' has 0 rows" + ) + expect_equal(nrow(result), 0) + + expect_warning( + result <- join_closest(x, y[0,], "time", "timestamp"), + "'y' has 0 rows" + ) + expect_equal(ncol(result), ncol(x)) +}) + +test_that("join_closest validates inputs", { + x <- data.frame(time = 1) + y <- data.frame(timestamp = 1) + + # Test invalid data frame + expect_error( + join_closest(list(), y, "time", "timestamp"), + "'x' must be a data frame" + ) + + # Test missing column + expect_error( + join_closest(x, y, "wrong", "timestamp"), + "Column 'wrong' not found in 'x' data frame" + ) + + # Test non-numeric column + x$time <- "a" + expect_error( + join_closest(x, y, "time", "timestamp"), + "Column 'time' in 'x' must be numeric" + ) + + # Test invalid direction + expect_error( + join_closest(x, y, "time", "timestamp", direction = ">"), + "'direction' must be either '<=' or '>='" + ) +}) + +test_that("join_closest handles unsorted and duplicate timestamps", { + x <- data.frame( + time = 1:3, + value1 = 1:3 + ) + y <- data.frame( + timestamp = c(1.5, 0.5, 2.5), # unsorted + value2 = letters[1:3] + ) + + # Test unsorted timestamps + expect_warning( + result <- join_closest(x, y, "time", "timestamp"), + "Timestamps in 'y' are not sorted" + ) + expect_equal(result$value2, c("b", "a", "a")) + + # Test duplicate timestamps + y2 <- data.frame( + timestamp = c(1.5, 1.5, 2.5), # duplicate + value2 = letters[1:3] + ) + expect_warning( + result <- join_closest(x, y2, "time", "timestamp"), + "Duplicate timestamps found in 'y'" + ) +}) + +test_that("join_closest handles grouped data", { + x <- data.frame( + group = c(1,1,1, 2,2,2), + time = c(0,1,2, 0,1,2), + value1 = 1:6 + ) + y <- data.frame( + group = c(1,1, 2,2), + timestamp = c(0.5,1.5, 0.5,1.5), + value2 = letters[1:4] + ) + + # Test >= direction with groups + result_ge <- join_closest(x, y, "time", "timestamp", ">=", by = "group") + expect_equal(nrow(result_ge), 6) + + # Mathc with dplyr + jb <- dplyr::join_by(group, dplyr::closest(time >= timestamp)) + result_dp <- dplyr::left_join(x, y, by = jb) + expect_equal(result_ge$value2, result_dp$value2) + + # Test <= direction with groups + result_le <- join_closest(x, y, "time", "timestamp", "<=", by = "group") + expect_equal(nrow(result_le), 6) + expect_equal(result_le$value2, c("a", "b", NA, "c", "d", NA)) + + # Test multiple grouping columns + x$group2 <- rep(c("A", "B"), each = 3) + y$group2 <- rep("A", 4) + result <- join_closest(x, y, "time", "timestamp", by = c("group", "group2")) + expect_equal(sum(!is.na(result$value2)), 3) # Only group 1, A should match +}) + +test_that("join_closest validates grouping columns", { + x <- data.frame(group = 1, time = 1) + y <- data.frame(other = 1, timestamp = 1) + + expect_error( + join_closest(x, y, "time", "timestamp", by = "group"), + "Grouping columns .* not found in 'y'" + ) + + expect_error( + join_closest(x, y, "time", "timestamp", by = "other"), + "Grouping columns .* not found in 'x'" + ) +})