Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace join_by() #98

Draft
wants to merge 1 commit into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 155 additions & 0 deletions R/join_closest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#' Join Data Frames by Closest Time Match, with Optional Grouping
#'
#' @description
#' Joins two data frames by matching the closest timestamp in the right-hand data frame
#' to each timestamp in the left-hand data frame, optionally within groups. Can match to
#' either the closest preceding or following timestamp.
#'
#' @param x Left-hand data frame
#' @param y Right-hand data frame
#' @param x_time_col Name of the time column in x
#' @param y_time_col Name of the time column in y
#' @param direction Either '>=' (default) to match to closest preceding or equal value,
#' or '<=' to match to closest following or equal value
#' @param by Character vector of column names to group by. These columns must exist
#' in both data frames and will be joined exactly.
#'
#' @return A data frame containing all columns from x and all columns from y except y_time_col.
#' Rows in x with no matching timestamp in y will have NA values for y columns.
#'
#' @examples
#' x <- data.frame(
#' group = c(1,1,1, 2,2,2),
#' time = c(0,1,2, 0,1,2),
#' value1 = 1:6
#' )
#' y <- data.frame(
#' group = c(1,1, 2,2),
#' timestamp = c(0.5,1.5, 0.5,1.5),
#' value2 = letters[1:4]
#' )
#'
#' # Match to closest preceding value within groups
#' join_closest(x, y, "time", "timestamp", by = "group", direction = ">=")
join_closest <- function(x, y, x_time_col, y_time_col, direction = ">=", by = NULL) {
# Input validation
if (!is.data.frame(x)) {
cli::cli_abort("'x' must be a data frame")
}
if (!is.data.frame(y)) {
cli::cli_abort("'y' must be a data frame")
}

if (!x_time_col %in% names(x)) {
cli::cli_abort("Column {.val {x_time_col}} not found in 'x' data frame")
}
if (!y_time_col %in% names(y)) {
cli::cli_abort("Column {.val {y_time_col}} not found in 'y' data frame")
}

if (!is.numeric(x[[x_time_col]])) {
cli::cli_abort("Column {.val {x_time_col}} in 'x' must be numeric")
}
if (!is.numeric(y[[y_time_col]])) {
cli::cli_abort("Column {.val {y_time_col}} in 'y' must be numeric")
}

if (!direction %in% c("<=", ">=")) {
cli::cli_abort("'direction' must be either '<=' or '>='")
}

# Validate grouping columns if provided
if (!is.null(by)) {
if (!all(by %in% names(x))) {
missing <- setdiff(by, names(x))
cli::cli_abort("Grouping columns {.val {missing}} not found in 'x'")
}
if (!all(by %in% names(y))) {
missing <- setdiff(by, names(y))
cli::cli_abort("Grouping columns {.val {missing}} not found in 'y'")
}
}

# Check for empty data frames
if (nrow(x) == 0) {
cli::cli_warn("'x' has 0 rows")
return(x)
}
if (nrow(y) == 0) {
cli::cli_warn("'y' has 0 rows")
return(x)
}

# If no grouping, use original function logic
if (is.null(by)) {
if (is.unsorted(y[[y_time_col]])) {
cli::cli_warn("Timestamps in 'y' are not sorted. Sorting automatically.")
y <- y[order(y[[y_time_col]]), , drop = FALSE]
}

if (anyDuplicated(y[[y_time_col]])) {
cli::cli_warn("Duplicate timestamps found in 'y'. Using first occurrence.")
}

return(closest_join_ungrouped(x, y, x_time_col, y_time_col, direction))
}

# Split data into groups and apply join within each group
result <- x
y_cols <- setdiff(names(y), c(y_time_col, by))
result[y_cols] <- NA

# Create group identifiers
x_groups <- do.call(paste, c(x[by], sep = "\001"))
y_groups <- do.call(paste, c(y[by], sep = "\001"))

# For each unique group in x
for (group in unique(x_groups)) {
x_idx <- which(x_groups == group)
y_idx <- which(y_groups == group)

if (length(y_idx) == 0) next

# Get subset of data for this group
x_subset <- x[x_idx, , drop = FALSE]
y_subset <- y[y_idx, , drop = FALSE]

# Sort y timestamps within group
y_subset <- y_subset[order(y_subset[[y_time_col]]), , drop = FALSE]

# Perform join for this group
matched <- closest_join_ungrouped(x_subset, y_subset, x_time_col, y_time_col, direction)

# Update result
result[x_idx, y_cols] <- matched[y_cols]
}

return(result)
}

# Internal function for ungrouped joining
closest_join_ungrouped <- function(x, y, x_time_col, y_time_col, direction) {
x_times <- x[[x_time_col]]
y_times <- y[[y_time_col]]

if (direction == ">=") {
matches <- findInterval(x_times, y_times)
matches[matches == 0] <- NA
matches[matches == length(y_times) & x_times > max(y_times)] <- NA
} else {
matches <- findInterval(x_times, y_times, rightmost.closed = TRUE)
matches[matches < length(y_times)] <- matches[matches < length(y_times)] + 1
matches[x_times <= min(y_times)] <- 1
}

# Create result data frame with NAs where needed
y_subset <- y[, setdiff(names(y), y_time_col), drop = FALSE]
result <- x

# Add each column from y separately to handle NAs properly
for(col in names(y_subset)) {
result[[col]] <- y_subset[[col]][matches]
}

return(result)
}
47 changes: 47 additions & 0 deletions man/join_closest.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

185 changes: 185 additions & 0 deletions tests/testthat/test-join_closest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
library(testthat)

# # Example usage:
data1 <- data.frame(
time = seq(from = 0, to = 10, length.out = 10),
value1 = rnorm(10)
)

data2 <- data.frame(
timestamp = seq(from = 0.05, to = 10, length.out = 8),
value2 = rnorm(8)
)
data1
data2

jb <- dplyr::join_by(dplyr::closest(time > timestamp))
result1 <- dplyr::left_join(data1, data2, by = jb) |>
dplyr::select(-time)

result2 <- join_closest(data2, data1, "timestamp", "time", direction = ">=")
all(identical(result1, result2))
result1
result2

microbenchmark::microbenchmark(
dp = dplyr::left_join(data1, data2, by = jb),
us = join_closest(data1, data2, "time", "timestamp", direction = ">=")
)



test_that("join_closest basic functionality works", {
x <- data.frame(
time = c(0, 1, 2, 3),
value1 = 1:4
)
y <- data.frame(
timestamp = c(0.5, 1.5, 2.5),
value2 = letters[1:3]
)

# Test >= direction
result_ge <- join_closest(x, y, "time", "timestamp", ">=")
expect_equal(nrow(result_ge), 4)
expect_equal(result_ge$value2, c(NA, "a", "a", "b"))

# Test <= direction
result_le <- join_closest(x, y, "time", "timestamp", "<=")
expect_equal(nrow(result_le), 4)
expect_equal(result_le$value2, c("a", "b", "c", NA))
})

test_that("join_closest handles edge cases", {
x <- data.frame(
time = c(-1, 0, 10, 11),
value1 = 1:4
)
y <- data.frame(
timestamp = c(0, 1, 2),
value2 = letters[1:3]
)

# Test values outside range
result <- join_closest(x, y, "time", "timestamp", ">=")
expect_equal(result$value2, c(NA, "a", "c", NA))

# Test empty data frames
expect_warning(
result <- join_closest(x[0,], y, "time", "timestamp"),
"'x' has 0 rows"
)
expect_equal(nrow(result), 0)

expect_warning(
result <- join_closest(x, y[0,], "time", "timestamp"),
"'y' has 0 rows"
)
expect_equal(ncol(result), ncol(x))
})

test_that("join_closest validates inputs", {
x <- data.frame(time = 1)
y <- data.frame(timestamp = 1)

# Test invalid data frame
expect_error(
join_closest(list(), y, "time", "timestamp"),
"'x' must be a data frame"
)

# Test missing column
expect_error(
join_closest(x, y, "wrong", "timestamp"),
"Column 'wrong' not found in 'x' data frame"
)

# Test non-numeric column
x$time <- "a"
expect_error(
join_closest(x, y, "time", "timestamp"),
"Column 'time' in 'x' must be numeric"
)

# Test invalid direction
expect_error(
join_closest(x, y, "time", "timestamp", direction = ">"),
"'direction' must be either '<=' or '>='"
)
})

test_that("join_closest handles unsorted and duplicate timestamps", {
x <- data.frame(
time = 1:3,
value1 = 1:3
)
y <- data.frame(
timestamp = c(1.5, 0.5, 2.5), # unsorted
value2 = letters[1:3]
)

# Test unsorted timestamps
expect_warning(
result <- join_closest(x, y, "time", "timestamp"),
"Timestamps in 'y' are not sorted"
)
expect_equal(result$value2, c("b", "a", "a"))

# Test duplicate timestamps
y2 <- data.frame(
timestamp = c(1.5, 1.5, 2.5), # duplicate
value2 = letters[1:3]
)
expect_warning(
result <- join_closest(x, y2, "time", "timestamp"),
"Duplicate timestamps found in 'y'"
)
})

test_that("join_closest handles grouped data", {
x <- data.frame(
group = c(1,1,1, 2,2,2),
time = c(0,1,2, 0,1,2),
value1 = 1:6
)
y <- data.frame(
group = c(1,1, 2,2),
timestamp = c(0.5,1.5, 0.5,1.5),
value2 = letters[1:4]
)

# Test >= direction with groups
result_ge <- join_closest(x, y, "time", "timestamp", ">=", by = "group")
expect_equal(nrow(result_ge), 6)

# Mathc with dplyr
jb <- dplyr::join_by(group, dplyr::closest(time >= timestamp))
result_dp <- dplyr::left_join(x, y, by = jb)
expect_equal(result_ge$value2, result_dp$value2)

# Test <= direction with groups
result_le <- join_closest(x, y, "time", "timestamp", "<=", by = "group")
expect_equal(nrow(result_le), 6)
expect_equal(result_le$value2, c("a", "b", NA, "c", "d", NA))

# Test multiple grouping columns
x$group2 <- rep(c("A", "B"), each = 3)
y$group2 <- rep("A", 4)
result <- join_closest(x, y, "time", "timestamp", by = c("group", "group2"))
expect_equal(sum(!is.na(result$value2)), 3) # Only group 1, A should match
})

test_that("join_closest validates grouping columns", {
x <- data.frame(group = 1, time = 1)
y <- data.frame(other = 1, timestamp = 1)

expect_error(
join_closest(x, y, "time", "timestamp", by = "group"),
"Grouping columns .* not found in 'y'"
)

expect_error(
join_closest(x, y, "time", "timestamp", by = "other"),
"Grouping columns .* not found in 'x'"
)
})
Loading