Skip to content

Commit

Permalink
Closes #1130. No keys needed on merge.data.table. No more deep copies.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Aug 10, 2015
1 parent 8bf3de4 commit 44b1e00
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 93 deletions.
107 changes: 42 additions & 65 deletions R/merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,98 +2,75 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
all.y = all, suffixes = c(".x", ".y"), allow.cartesian=getOption("datatable.allow.cartesian"), ...) {
if (!inherits(y, 'data.table')) {
y <- as.data.table(y)
if (missing(by)) {
if (missing(by) && missing(by.x)) {
by <- key(x)
}
}
if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.")
if (any(duplicated(names(y)))) stop("y has some duplicated column name(s): ",paste(names(y)[duplicated(names(y))],collapse=","),". Please remove or rename the duplicate(s) and try again.")

## Determine by and rename columns of y, if by.x and by.y are supplied
if (!is.null(by.x)){
if (!is.null(by)) warning("Supplied both by and by.x and only by will be used")
else {
by <- by.x
if (length(by.x) != length(by.y)) stop("by.x and by.y must be of the same length")
setnames(y, by.y, by.x)
on.exit(setnames(y, by.x, by.y))
}
## set up 'by'/'by.x'/'by.y'
if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
stop("`by.x` and `by.y` must be of same length.")
if (!missing(by) && !missing(by.x))
warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
if (!is.null(by.x)) {
if ( !is.character(by.x) || !is.character(by.y))
stop("A non-empty vector of column names are required for `by.x` and `by.y`.")
if (!all(by.x %in% names(x)))
stop("Elements listed in `by.x` must be valid column names in x.")
if (!all(by.y %in% names(y)))
stop("Elements listed in `by.y` must be valid column names in y.")
by = by.x
names(by) = by.y
} else {
if (is.null(by))
by = intersect(key(x), key(y))
if (is.null(by))
by = key(x)
if (is.null(by))
stop("Can not match keys in x and y to automatically determine appropriate `by` parameter. Please set `by` value explicitly.")
if (length(by) == 0L || !is.character(by))
stop("A non-empty vector of column names for `by` is required.")
if (!all(by %in% intersect(colnames(x), colnames(y))))
stop("Elements listed in `by` must be valid column names in x and y")
by.x = by.y = by
}

## Try to infer proper value for `by`
if (is.null(by)) {
by <- intersect(key(x), key(y))
}
if (is.null(by)) {
by <- key(x)
}
if (is.null(by)) {
stop("Can not match keys in x and y to automatically determine ",
"appropriate `by` parameter. Please set `by` value explicitly.")
}
if (length(by) == 0L || !is.character(by)) {
stop("A non-empty vector of column names for `by` is required.")
}
if (!all(by %in% intersect(colnames(x), colnames(y)))) {
stop("Elements listed in `by` must be valid column names in x and y")
}

## Checks to see that keys on dt are set and are in correct order
.reset.keys <- function(dt, by) {
dt.key <- key(dt)
length(dt.key) < length(by) || !all(dt.key[1:length(by)] == by)
}

if (.reset.keys(y, by)) {
y=setkeyv(copy(y),by)
# TO DO Add a secondary key here, when implemented which would be cached in the object
}

xkey = if (identical(key(x),by)) x else setkeyv(copy(x),by)
# TO DO: new [.data.table argument joincols or better name would allow leaving x as is if by was a head subset
# of key(x). Also NAMED on each column would allow subset references. Also, a secondary key may be
# much simpler but just need an argument to tell [.data.table to use the 2key of i.

# with i. prefix in v1.9.3, this goes away. Left here for now ...
## sidestep the auto-increment column number feature-leading-to-bug by
## ensuring no names end in ".1", see unit test
## "merge and auto-increment columns in y[x]" in test-data.frame.like.R
dupnames <- setdiff(intersect(names(xkey), names(y)), by)
start = setdiff(names(x), by.x)
end = setdiff(names(y), by.y)
dupnames = intersect(start, end)
if (length(dupnames)) {
xkey = setnames(shallow(xkey), dupnames, sprintf("%s.", dupnames))
y = setnames(shallow(y), dupnames, sprintf("%s.", dupnames))
start[start %in% dupnames] = paste(dupnames, suffixes[1L], sep="")
end[end %in% dupnames] = paste(dupnames, suffixes[2L], sep="")
}

dt = y[xkey,nomatch=ifelse(all.x,NA,0),allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names)
dt = y[x,nomatch=ifelse(all.x,NA,0),on=by,allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names)

if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed
# Perhaps not very commonly used, so not a huge deal that the join is redone here.
missingyidx = seq.int(nrow(y))
whichy = y[xkey,which=TRUE,nomatch=0,allow.cartesian=allow.cartesian] # !!TO DO!!: Use not join (i=-xkey) here now that's implemented
whichy = y[x,which=TRUE,nomatch=0,on=by,allow.cartesian=allow.cartesian] # !!TO DO!!: Use not join (i=-x) here now that's implemented
whichy = whichy[whichy>0]
if (length(whichy)) missingyidx = missingyidx[-whichy]
if (length(missingyidx)) {
yy <- y[missingyidx]
othercolsx <- setdiff(names(xkey), by)
yy = y[missingyidx]
othercolsx = setdiff(names(x), by)
if (length(othercolsx)) {
tmp = rep.int(NA_integer_, length(missingyidx))
yy <- cbind(yy, xkey[tmp, othercolsx, with = FALSE])
yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
}
dt = rbind(dt, yy, use.names=FALSE) # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
# takes care of #5672 without having to save names. This is how it should be, IMHO.
}
}

end = setdiff(names(y),by) # X[Y] sytax puts JIS i columns at the end, merge likes them alongside i.
setcolorder(dt,c(setdiff(names(dt),end),end))

if (nrow(dt) > 0) setkeyv(dt,by)

if (length(dupnames)) {
setnames(dt, sprintf("%s.", dupnames), paste(dupnames, suffixes[2], sep=""))
setnames(dt, sprintf("i.%s.", dupnames), paste(dupnames, suffixes[1], sep=""))
}

# X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
newend = setdiff(names(y), by.y)
setcolorder(dt, c(setdiff(names(dt), newend), newend))
setnames(dt, c(by.x, start, end))
if (nrow(dt) > 0) setkeyv(dt, by.x)
dt
}

12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,16 @@

21. `setDF` also converts `list` of equal length to `data.frame` by reference now. Closes [#1132](https://github.com/Rdatatable/data.table/issues/1132).

22. `merge.data.table` now has new arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637). Thanks to @NelloBlaser.
22. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default. Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148).

23. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default. Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148).
23. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784).

24. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784).
24. `fread` gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027).

25. `fread` gains `check.names` argument, with default value `FALSE`. When `TRUE`, it uses the base function `make.unique()` to ensure that the column names of the data.table read in are all unique. Thanks to David Arenburg for filing [#1027](https://github.com/Rdatatable/data.table/issues/1027).
25. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.

22. `merge.data.table` gains arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637) and [#1130](https://github.com/Rdatatable/data.table/issues/1130). No copies are made even when the specified columns aren't key columns in the data.tables, so merges are much faster and more memory efficient. Thanks to @blasern for the initial PRs.

26. data.tables can join now without having to set keys by using the new `on` argument. For example: `DT1[DT2, on=c(x = "y")]` would join column 'y' of `DT2` with 'x' of `DT1`. `DT1[DT2, on="y"]` would join on column 'y' on both data.tables. Closes [#1130](https://github.com/Rdatatable/data.table/issues/1130) partly.

#### BUG FIXES

1. `if (TRUE) DT[,LHS:=RHS]` no longer prints, [#869](https://github.com/Rdatatable/data.table/issues/869) and [#1122](https://github.com/Rdatatable/data.table/issues/1122). Tests added. To get this to work we've had to live with one downside: if a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` or `print(DT)` is typed at the prompt, nothing will be printed. A repeated `DT` or `print(DT)` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `DT[]` at the prompt is guaranteed to print. As before, adding an extra `[]` on the end of a `:=` query is a recommended idiom to update and then print; e.g. `> DT[,foo:=3L][]`. Thanks to Jureiss and Jan Gorecki for reporting.
Expand Down
23 changes: 23 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6665,6 +6665,29 @@ if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not
unlink(f)
}

# rest of #1130 - merge doesn't copy, instead uses joins without keys.
# Each test below runs the same merge twice on the same inputs: once through
# merge() on data.frames (ans1) and once through merge() on data.tables (ans2,
# converted back with setDF), and checks that the two results agree.
# d1 and d2 are switched between data.frame and data.table with setDF()/setDT().
set.seed(1L)
d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)

# merge on a shared column name given through `by`; `total` exists in both inputs
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by="A"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), by="A"))
test(1543.1, ans1, ans2)
# same merge with all=TRUE
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), all=TRUE, by="A"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A"))
# numbered 1543.2 (was 1542.2) so it belongs to the same series as its siblings
test(1543.2, ans1, ans2)
# test duplicate name cases
# after the rename d2 has columns B, A, total: the join column is "B" and the
# non-join column "A" now clashes with d1's join column "A"
setnames(d2, c("A", "Y"), c("B", "A"))
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B"))
test(1543.3, ans1, ans2)
# swap the roles of x and y
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A"))
ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A"))
test(1543.4, ans1, ans2)
# swapped roles with all=TRUE
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A"))
ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
test(1543.5, ans1, ans2)

##########################


Expand Down
49 changes: 27 additions & 22 deletions man/merge.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
\alias{merge.data.table}
\title{ Merge Two Data Tables }
\description{
Relatively quick merge of two \code{data.table}s based on common key columns (by default).
Fast merge of two \code{data.table}s.

This \code{merge} method for \code{data.table} is meant to act very similarly to the
\code{merge} method for \code{data.frame}, with the major exception being that
the default columns used to merge two \code{data.table} inputs are the shared key columns
rather than the shared columns with the same names.
This \code{merge} method for \code{data.table} behaves very similarly to that
of \code{data.frame}s with one major exception: By default,
the columns used to merge the \code{data.table}s are the shared key columns
rather than the shared columns with the same names. Set the \code{by}, or \code{by.x},
\code{by.y} arguments explicitly to override this default.

}

Expand Down Expand Up @@ -71,23 +72,15 @@ allow.cartesian=getOption("datatable.allow.cartesian"), # default FALSE
\details{
\code{\link{merge}} is a generic function in base R. It dispatches to either the
\code{merge.data.frame} method or \code{merge.data.table} method depending on the class of its first argument.
Typing \code{?merge} at the prompt should present a choice of two links:
the help pages for each of these \code{merge} methods. You don't need to use the full name of the
method although you may if you wish; i.e., \code{merge(DT1,DT2)} is idiomatic R but you can bypass
method dispatch by going direct if you wish: \code{merge.data.table(DT1,DT2)}.

Note that if the specified columns in \code{by} are not the key (or
head of the key) of \code{x} or \code{y}, then a copy is first rekeyed
prior to performing the merge. This might make this function perform
slower than you are expecting. When secondary keys are implemented in
future we expect performance in this case to improve.

For a more \code{data.table}-centric (and faster) way of merging two \code{data.table}s,
see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. In recent
versions, however, \code{merge()} is much closer to the speed of \code{x[y, ...]}.
See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}.

Columns of numeric types (i.e., double) have their last two bytes rounded off while computing order, by default, to avoid any unexpected behaviour due to limitations in representing floating point numbers precisely. For large numbers (integers > 2^31), we recommend using \code{bit64::integer64}. Have a look at \code{\link{setNumericRounding}} to learn more.
In versions \code{< v1.9.6}, if the specified columns in \code{by} were not the key (or head of the key) of \code{x} or \code{y}, then a \code{\link{copy}} was first rekeyed prior to performing the merge. This was slower and memory inefficient.
In version \code{v1.9.4} secondary keys were implemented. In \code{v1.9.6}, the concept of secondary keys has been
extended to \code{merge}. No deep copies are made anymore, which makes \code{merge} much faster and more memory efficient. There is also better control over the columns to merge on, through the newly implemented \code{by.x} and \code{by.y} arguments.
For a more \code{data.table}-centric way of merging two \code{data.table}s, see \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. See FAQ 1.12 for a detailed comparison of \code{merge} and \code{x[y, ...]}.
Merges on numeric columns: Columns of numeric types (i.e., double) have their last two bytes rounded off while computing order, by default, to avoid any unexpected behaviour due to limitations in representing floating point numbers precisely. For large numbers (integers > 2^31), we recommend using \code{bit64::integer64}. Have a look at \code{\link{setNumericRounding}} to learn more.
}
Expand Down Expand Up @@ -138,6 +131,18 @@ merge(d4, d1)
merge(d1, d4, all=TRUE)
merge(d4, d1, all=TRUE)
# new feature, no need to set keys anymore
set.seed(1L)
d1 <- data.table(a=sample(rep(1:3,each=2)), z=1:6)
d2 <- data.table(a=2:0, z=10:12)
merge(d1, d2, by="a")
merge(d1, d2, by="a", all=TRUE)
# new feature, using by.x and by.y arguments
setnames(d2, "a", "b")
merge(d1, d2, by.x="a", by.y="b")
merge(d1, d2, by.x="a", by.y="b", all=TRUE)
merge(d2, d1, by.x="b", by.y="a")
}
\keyword{ data }
Expand Down

0 comments on commit 44b1e00

Please sign in to comment.