From de06ac6dcbfd3b52223a447c4dc21171c0a8fefa Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 19 Mar 2019 18:01:52 -0700 Subject: [PATCH] rbind and rbindlist recycle length-1 columns; #524 --- NEWS.md | 2 +- inst/tests/tests.Rraw | 18 ++++---- src/assign.c | 96 +++++++++++++++++++++++++++++++++++-------- src/data.table.h | 2 +- src/rbindlist.c | 94 +++++++----------------------------------- 5 files changed, 105 insertions(+), 107 deletions(-) diff --git a/NEWS.md b/NEWS.md index aa81e30bf1..fdad5363f4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,7 +38,7 @@ 12. `rbind` and `rbindlist(..., use.names=TRUE)` with over 255 columns could return the columns in a random order, [#3373](https://github.com/Rdatatable/data.table/issues/3373). The contents and name of each column was correct but the order that the columns appeared in the result might not match the original input. -13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), and support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819). +13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524). #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 558ada6887..0580ae25a6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1775,13 +1775,11 @@ t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S") test(626, unique(t), t[c(1,2,4)]) test(627, class(unique(t)), "ITime") -# Test recycling list() rbind - with recent C-level changes, this seems not possible (like rbindlist) -# old test commented. -# test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), -# data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L))) -test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), error = "inconsistent with the first column of that item which is length") +# Test recycling list() rbind; #524. This was commented out until v1.12.2 when it was reinstated in PR#3455 +test(628.1, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), + data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L))) # Test switch in .rbind.data.table for factor columns -test(628.5, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))), +test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))), data.table(a=1:4,b=factor(letters[1:4]),c=factor(c(rep("foo",3),"bar"), levels = c("foo", "bar")))) # Test merge with common names and all.y=TRUE, #2011 @@ -1933,7 +1931,7 @@ l = list(data.table(a=1:2, b=7:8), test(676, rbindlist(l[1:3]), data.table(a=1:6,b=7:12)) test(677, rbindlist(l[c(10,1,10,2,10)]), data.table(a=1:4,b=7:10)) # NULL items ignored test(678, rbindlist(l[c(1,4)]), error="Item 2 has 1 columns, inconsistent with item 1 which has 2") -test(679, rbindlist(l[c(1:2,5)]), error="Column 2 of item 3 is length 1 inconsistent with the first column of that item which is length 2.") +test(679, rbindlist(l[c(1:2,5)]), data.table(a=c(1:4,15:16), b=c(7:10,17L,17L))) test(680, rbindlist(l[c(2,6)]), data.table(a=c(3,4,18,19), V2=c(9:10,20:21))) # coerces 18 and 19 to numeric test(681, rbindlist(list(data.table(a=letters[1:2],b=c(1.2,1.3),c=1:2), list("c",1.4,3L), NULL, list(letters[4:6],c(1.5,1.6,1.7),4:6))), data.table(a=letters[1:6], b=seq(1.2,1.7,by=0.1), c=1:6)) @@ -13669,8 +13667,8 @@ test(2002.4, rbindlist( list(list(a=1L,z=list()), list(a=2L, z=list("m"))) ), warning="Column 2 ['z'] of item 1 is length 0. This (and 0 others like it) has been filled with NA") test(2002.5, rbindlist( list( list(a=1L, z=list("z")), list(a=2L, z=list(c("a","b"))) )), data.table(a=1:2, z=list("z", c("a","b")))) -test(2002.6, rbindlist( list( list(a=1, z=list("z",1)), list(a=2, z=list(c("a","b"))) )), - error="Column 2 of item 1 is length 2 inconsistent with the first column of that item which is length 1") +test(2002.6, rbindlist( list( list(a=1:2, z=list("z",1,"k")), list(a=2, z=list(c("a","b"))) )), + error="Column 1 of item 1 is length 2 inconsistent with column 2 which is length 3. Only length-1 columns are recycled.") test(2002.7, rbindlist( list(list(a=1L, z=list(list())), list(a=2L, z=list(list("m")))) ), data.table(a=1:2, z=list(list(),list("m")))) test(2002.8, rbindlist( list(list(a=1L, z=list(list("z"))), list(a=2L, z=list(list(c("a","b"))))) ), @@ -13722,7 +13720,7 @@ test(2006, rbindlist(list(data.table(x = as.raw(1), y=as.raw(3)), data.table(x = if (test_bit64) { test(2007.1, rbindlist(list( list(a=as.integer64(1), b=3L), list(a=2L, b=4L) )), data.table(a=as.integer64(1:2), b=3:4)) test(2007.2, rbindlist(list( list(a=3.4, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6), - warning="Column 1 of item 1 is being coerced to integer64 but contains a non-integer value [(]3.40.* at position 1[)]. Precision lost.") + warning="Column 1 of item 1: coerced to integer64 but contains a non-integer value [(]3.40.* at position 1[)]; precision lost") test(2007.3, rbindlist(list( list(a=3.0, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6)) test(2007.4, rbindlist(list( list(b=5:6), list(a=as.integer64(4), b=7L)), fill=TRUE), data.table(b=5:7, a=as.integer64(c(NA,NA,4)))) # tests writeNA of integer64 test(2007.5, rbindlist(list( list(a=INT(1,NA,-2)), list(a=as.integer64(c(3,NA))) )), data.table(a=as.integer64(c(1,NA,-2,3,NA)))) # int NAs combined with int64 NA diff --git a/src/assign.c b/src/assign.c index e177fc3ed4..7eeec99165 100644 --- a/src/assign.c +++ b/src/assign.c @@ -270,8 +270,6 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0])); } -void memrecycle(SEXP target, SEXP where, int r, int len, SEXP source); - SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP verb) { // For internal use only by := in [.data.table, and set() @@ -788,19 +786,20 @@ static bool anyNamed(SEXP x) { return false; } -void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source) +static char memrecycle_message[1000]; + +const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source) // like memcpy but recycles single-item source // 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer() // assigns to target[start:start+len-1] or target[where[start:start+len-1]] where start is 0-based { - if (len<1) return; - if (TYPEOF(target) != TYPEOF(source)) error("Internal error: TYPEOF(target)['%s']!=TYPEOF(source)['%s']", type2char(TYPEOF(target)),type2char(TYPEOF(source))); // # nocov + if (len<1) return NULL; int slen = length(source); - if (slen==0) return; + if (slen==0) return NULL; if (slen>1 && slen!=len) error("Internal error: recycle length error not caught earlier. slen=%d len=%d", slen, len); // # nocov // Internal error because the column has already been added to the DT, so length mismatch should have been caught before adding the column. // for 5647 this used to limit slen to len, but no longer - + *memrecycle_message = '\0'; int protecti=0; if (isNewList(source)) { // A list() column; i.e. target is a column of pointers to SEXPs rather than the much more common case @@ -820,28 +819,89 @@ void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source) protecti++; } } - if (!length(where)) { + if (!length(where)) { // e.g. called from rbindlist with where=R_NilValue switch (TYPEOF(target)) { - case LGLSXP: case INTSXP : + case RAWSXP: + if (TYPEOF(source)!=RAWSXP) { source = PROTECT(coerceVector(source, RAWSXP)); protecti++; } if (slen==1) { // recycle single items - int *td = INTEGER(target); + Rbyte *td = RAW(target)+start; + const Rbyte val = RAW(source)[0]; + for (int i=0; i0 checked above + eachMax[i] = 0; SEXP li = VECTOR_ELT(l, i); if (isNull(li)) continue; if (TYPEOF(li) != VECSXP) error("Item %d of input is not a data.frame, data.table or list", i+1); @@ -39,13 +40,15 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int nNames = length(getAttrib(li, R_NamesSymbol)); if (nNames>0 && nNames!=thisncol) error("Item %d has %d columns but %d column names. Invalid object.", i+1, thisncol, nNames); if (nNames>0) anyNames=true; - int thisnrow = length(VECTOR_ELT(li,0)); - for (int j=1; jmaxLen) { maxLen=tt; whichMax=j; } } + for (int j=0; j0 && tt!=thisnrow) error("Column %d of item %d is length %d inconsistent with the first column of that item which is length %d. rbind/rbindlist expects each item in the input list to be a uniform list, data.frame or data.table", j+1, i+1, tt, thisnrow); - if (tt==0 && thisnrow>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; } + if (tt>1 && tt!=maxLen) error("Column %d of item %d is length %d inconsistent with column %d which is length %d. Only length-1 columns are recycled.", j+1, i+1, tt, whichMax+1, maxLen); + if (tt==0 && maxLen>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; } } - nrow += thisnrow; + eachMax[i] = maxLen; + nrow += maxLen; } if (numZero) { // #1871 SEXP names = getAttrib(VECTOR_ELT(l, firstZeroItem), R_NamesSymbol); @@ -248,10 +251,9 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int nLevel=0, allocLevel=0; SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc. for (int i=0; i