roaldarbol · roaldarbol · Jan 8, 2025
diff --git a/R/join_closest.R b/R/join_closest.R
@@ -0,0 +1,155 @@
+#' Join Data Frames by Closest Time Match, with Optional Grouping
+#'
+#' @description
+#' Joins two data frames by matching the closest timestamp in the right-hand data frame
+#' to each timestamp in the left-hand data frame, optionally within groups. Can match to
+#' either the closest preceding or following timestamp.
+#'
+#' @param x Left-hand data frame
+#' @param y Right-hand data frame
+#' @param x_time_col Name of the time column in x
+#' @param y_time_col Name of the time column in y
+#' @param direction Either '>=' (default) to match to closest preceding or equal value,
+#'   or '<=' to match to closest following or equal value
+#' @param by Character vector of column names to group by. These columns must exist
+#'   in both data frames and will be joined exactly.
+#'
+#' @return A data frame containing all columns from x and all columns from y except y_time_col.
+#'   Rows in x with no matching timestamp in y will have NA values for y columns.
+#'
+#' @examples
+#' x <- data.frame(
+#'   group = c(1,1,1, 2,2,2),
+#'   time = c(0,1,2, 0,1,2),
+#'   value1 = 1:6
+#' )
+#' y <- data.frame(
+#'   group = c(1,1, 2,2),
+#'   timestamp = c(0.5,1.5, 0.5,1.5),
+#'   value2 = letters[1:4]
+#' )
+#'
+#' # Match to closest preceding value within groups
+#' join_closest(x, y, "time", "timestamp", by = "group", direction = ">=")
+join_closest <- function(x, y, x_time_col, y_time_col, direction = ">=", by = NULL) {
+  # Input validation
+  if (!is.data.frame(x)) {
+    cli::cli_abort("'x' must be a data frame")
+  }
+  if (!is.data.frame(y)) {
+    cli::cli_abort("'y' must be a data frame")
+  }
+
+  if (!x_time_col %in% names(x)) {
+    cli::cli_abort("Column {.val {x_time_col}} not found in 'x' data frame")
+  }
+  if (!y_time_col %in% names(y)) {
+    cli::cli_abort("Column {.val {y_time_col}} not found in 'y' data frame")
+  }
+
+  if (!is.numeric(x[[x_time_col]])) {
+    cli::cli_abort("Column {.val {x_time_col}} in 'x' must be numeric")
+  }
+  if (!is.numeric(y[[y_time_col]])) {
+    cli::cli_abort("Column {.val {y_time_col}} in 'y' must be numeric")
+  }
+
+  if (!direction %in% c("<=", ">=")) {
+    cli::cli_abort("'direction' must be either '<=' or '>='")
+  }
+
+  # Validate grouping columns if provided
+  if (!is.null(by)) {
+    if (!all(by %in% names(x))) {
+      missing <- setdiff(by, names(x))
+      cli::cli_abort("Grouping columns {.val {missing}} not found in 'x'")
+    }
+    if (!all(by %in% names(y))) {
+      missing <- setdiff(by, names(y))
+      cli::cli_abort("Grouping columns {.val {missing}} not found in 'y'")
+    }
+  }
+
+  # Check for empty data frames
+  if (nrow(x) == 0) {
+    cli::cli_warn("'x' has 0 rows")
+    return(x)
+  }
+  if (nrow(y) == 0) {
+    cli::cli_warn("'y' has 0 rows")
+    return(x)
+  }
+
+  # If no grouping, use original function logic
+  if (is.null(by)) {
+    if (is.unsorted(y[[y_time_col]])) {
+      cli::cli_warn("Timestamps in 'y' are not sorted. Sorting automatically.")
+      y <- y[order(y[[y_time_col]]), , drop = FALSE]
+    }
+
+    if (anyDuplicated(y[[y_time_col]])) {
+      cli::cli_warn("Duplicate timestamps found in 'y'. Using first occurrence.")
+    }
+
+    return(closest_join_ungrouped(x, y, x_time_col, y_time_col, direction))
+  }
+
+  # Split data into groups and apply join within each group
+  result <- x
+  y_cols <- setdiff(names(y), c(y_time_col, by))
+  result[y_cols] <- NA
+
+  # Create group identifiers
+  x_groups <- do.call(paste, c(x[by], sep = "\001"))
+  y_groups <- do.call(paste, c(y[by], sep = "\001"))
+
+  # For each unique group in x
+  for (group in unique(x_groups)) {
+    x_idx <- which(x_groups == group)
+    y_idx <- which(y_groups == group)
+
+    if (length(y_idx) == 0) next
+
+    # Get subset of data for this group
+    x_subset <- x[x_idx, , drop = FALSE]
+    y_subset <- y[y_idx, , drop = FALSE]
+
+    # Sort y timestamps within group
+    y_subset <- y_subset[order(y_subset[[y_time_col]]), , drop = FALSE]
+
+    # Perform join for this group
+    matched <- closest_join_ungrouped(x_subset, y_subset, x_time_col, y_time_col, direction)
+
+    # Update result
+    result[x_idx, y_cols] <- matched[y_cols]
+  }
+
+  return(result)
+}
+
+# Internal function for ungrouped joining
+closest_join_ungrouped <- function(x, y, x_time_col, y_time_col, direction) {
+  x_times <- x[[x_time_col]]
+  y_times <- y[[y_time_col]]
+
+  if (direction == ">=") {
+    matches <- findInterval(x_times, y_times)
+    matches[matches == 0] <- NA
+    matches[matches == length(y_times) & x_times > max(y_times)] <- NA
+  } else {
+    matches <- findInterval(x_times, y_times, rightmost.closed = TRUE)
+    matches[matches < length(y_times)] <- matches[matches < length(y_times)] + 1
+    matches[x_times <= min(y_times)] <- 1
+  }
+
+  # Create result data frame with NAs where needed
+  y_subset <- y[, setdiff(names(y), y_time_col), drop = FALSE]
+  result <- x
+
+  # Add each column from y separately to handle NAs properly
+  for(col in names(y_subset)) {
+    result[[col]] <- y_subset[[col]][matches]
+  }
+
+  return(result)
+}
diff --git a/man/join_closest.Rd b/man/join_closest.Rd
diff --git a/tests/testthat/test-join_closest.R b/tests/testthat/test-join_closest.R
@@ -0,0 +1,185 @@
+library(testthat)
+
+# # Example usage:
+data1 <- data.frame(
+  time = seq(from = 0, to = 10, length.out = 10),
+  value1 = rnorm(10)
+)
+
+data2 <- data.frame(
+  timestamp = seq(from = 0.05, to = 10, length.out = 8),
+  value2 = rnorm(8)
+)
+data1
+data2
+
+jb <- dplyr::join_by(dplyr::closest(time > timestamp))
+result1 <- dplyr::left_join(data1, data2, by = jb) |>
+  dplyr::select(-time)
+
+result2 <- join_closest(data2, data1, "timestamp", "time", direction = ">=")
+all(identical(result1, result2))
+result1
+result2
+
+microbenchmark::microbenchmark(
+  dp = dplyr::left_join(data1, data2, by = jb),
+  us = join_closest(data1, data2, "time", "timestamp", direction = ">=")
+)
+
+
+
+test_that("join_closest basic functionality works", {
+  x <- data.frame(
+    time = c(0, 1, 2, 3),
+    value1 = 1:4
+  )
+  y <- data.frame(
+    timestamp = c(0.5, 1.5, 2.5),
+    value2 = letters[1:3]
+  )
+
+  # Test >= direction
+  result_ge <- join_closest(x, y, "time", "timestamp", ">=")
+  expect_equal(nrow(result_ge), 4)
+  expect_equal(result_ge$value2, c(NA, "a", "a", "b"))
+
+  # Test <= direction
+  result_le <- join_closest(x, y, "time", "timestamp", "<=")
+  expect_equal(nrow(result_le), 4)
+  expect_equal(result_le$value2, c("a", "b", "c", NA))
+})
+
+test_that("join_closest handles edge cases", {
+  x <- data.frame(
+    time = c(-1, 0, 10, 11),
+    value1 = 1:4
+  )
+  y <- data.frame(
+    timestamp = c(0, 1, 2),
+    value2 = letters[1:3]
+  )
+
+  # Test values outside range
+  result <- join_closest(x, y, "time", "timestamp", ">=")
+  expect_equal(result$value2, c(NA, "a", "c", NA))
+
+  # Test empty data frames
+  expect_warning(
+    result <- join_closest(x[0,], y, "time", "timestamp"),
+    "'x' has 0 rows"
+  )
+  expect_equal(nrow(result), 0)
+
+  expect_warning(
+    result <- join_closest(x, y[0,], "time", "timestamp"),
+    "'y' has 0 rows"
+  )
+  expect_equal(ncol(result), ncol(x))
+})
+
+test_that("join_closest validates inputs", {
+  x <- data.frame(time = 1)
+  y <- data.frame(timestamp = 1)
+
+  # Test invalid data frame
+  expect_error(
+    join_closest(list(), y, "time", "timestamp"),
+    "'x' must be a data frame"
+  )
+
+  # Test missing column
+  expect_error(
+    join_closest(x, y, "wrong", "timestamp"),
+    "Column 'wrong' not found in 'x' data frame"
+  )
+
+  # Test non-numeric column
+  x$time <- "a"
+  expect_error(
+    join_closest(x, y, "time", "timestamp"),
+    "Column 'time' in 'x' must be numeric"
+  )
+
+  # Test invalid direction
+  expect_error(
+    join_closest(x, y, "time", "timestamp", direction = ">"),
+    "'direction' must be either '<=' or '>='"
+  )
+})
+
+test_that("join_closest handles unsorted and duplicate timestamps", {
+  x <- data.frame(
+    time = 1:3,
+    value1 = 1:3
+  )
+  y <- data.frame(
+    timestamp = c(1.5, 0.5, 2.5),  # unsorted
+    value2 = letters[1:3]
+  )
+
+  # Test unsorted timestamps
+  expect_warning(
+    result <- join_closest(x, y, "time", "timestamp"),
+    "Timestamps in 'y' are not sorted"
+  )
+  expect_equal(result$value2, c("b", "a", "a"))
+
+  # Test duplicate timestamps
+  y2 <- data.frame(
+    timestamp = c(1.5, 1.5, 2.5),  # duplicate
+    value2 = letters[1:3]
+  )
+  expect_warning(
+    result <- join_closest(x, y2, "time", "timestamp"),
+    "Duplicate timestamps found in 'y'"
+  )
+})
+
+test_that("join_closest handles grouped data", {
+  x <- data.frame(
+    group = c(1,1,1, 2,2,2),
+    time = c(0,1,2, 0,1,2),
+    value1 = 1:6
+  )
+  y <- data.frame(
+    group = c(1,1, 2,2),
+    timestamp = c(0.5,1.5, 0.5,1.5),
+    value2 = letters[1:4]
+  )
+
+  # Test >= direction with groups
+  result_ge <- join_closest(x, y, "time", "timestamp", ">=", by = "group")
+  expect_equal(nrow(result_ge), 6)
+
+  # Mathc with dplyr
+  jb <- dplyr::join_by(group, dplyr::closest(time >= timestamp))
+  result_dp <- dplyr::left_join(x, y, by = jb)
+  expect_equal(result_ge$value2, result_dp$value2)
+
+  # Test <= direction with groups
+  result_le <- join_closest(x, y, "time", "timestamp", "<=", by = "group")
+  expect_equal(nrow(result_le), 6)
+  expect_equal(result_le$value2, c("a", "b", NA, "c", "d", NA))
+
+  # Test multiple grouping columns
+  x$group2 <- rep(c("A", "B"), each = 3)
+  y$group2 <- rep("A", 4)
+  result <- join_closest(x, y, "time", "timestamp", by = c("group", "group2"))
+  expect_equal(sum(!is.na(result$value2)), 3) # Only group 1, A should match
+})
+
+test_that("join_closest validates grouping columns", {
+  x <- data.frame(group = 1, time = 1)
+  y <- data.frame(other = 1, timestamp = 1)
+
+  expect_error(
+    join_closest(x, y, "time", "timestamp", by = "group"),
+    "Grouping columns .* not found in 'y'"
+  )
+
+  expect_error(
+    join_closest(x, y, "time", "timestamp", by = "other"),
+    "Grouping columns .* not found in 'x'"
+  )
+})