Skip to content

Commit

Permalink
rbind and rbindlist recycle length-1 columns; #524
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle committed Mar 20, 2019
1 parent 5936dfb commit de06ac6
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 107 deletions.
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

12. `rbind` and `rbindlist(..., use.names=TRUE)` with over 255 columns could return the columns in a random order, [#3373](https://github.com/Rdatatable/data.table/issues/3373). The contents and name of each column was correct but the order that the columns appeared in the result might not match the original input.

13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), and support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819).
13. `rbind` and `rbindlist` now combine `integer64` columns together with non-`integer64` columns correctly [#1349](https://github.com/Rdatatable/data.table/issues/1349), support `raw` columns [#2819](https://github.com/Rdatatable/data.table/issues/2819), and recycle length-1 columns [#524](https://github.com/Rdatatable/data.table/issues/524).

#### NOTES

Expand Down
18 changes: 8 additions & 10 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -1775,13 +1775,11 @@ t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S")
test(626, unique(t), t[c(1,2,4)])
test(627, class(unique(t)), "ITime")

# Test recycling list() rbind - with recent C-level changes, this seems not possible (like rbindlist)
# old test commented.
# test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))),
# data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L)))
test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), error = "inconsistent with the first column of that item which is length")
# Test recycling list() rbind; #524. This was commented out until v1.12.2 when it was reinstated in PR#3455
test(628.1, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))),
data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L)))
# Test switch in .rbind.data.table for factor columns
test(628.5, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))),
test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))),
data.table(a=1:4,b=factor(letters[1:4]),c=factor(c(rep("foo",3),"bar"), levels = c("foo", "bar"))))

# Test merge with common names and all.y=TRUE, #2011
Expand Down Expand Up @@ -1933,7 +1931,7 @@ l = list(data.table(a=1:2, b=7:8),
test(676, rbindlist(l[1:3]), data.table(a=1:6,b=7:12))
test(677, rbindlist(l[c(10,1,10,2,10)]), data.table(a=1:4,b=7:10)) # NULL items ignored
test(678, rbindlist(l[c(1,4)]), error="Item 2 has 1 columns, inconsistent with item 1 which has 2")
test(679, rbindlist(l[c(1:2,5)]), error="Column 2 of item 3 is length 1 inconsistent with the first column of that item which is length 2.")
test(679, rbindlist(l[c(1:2,5)]), data.table(a=c(1:4,15:16), b=c(7:10,17L,17L)))
test(680, rbindlist(l[c(2,6)]), data.table(a=c(3,4,18,19), V2=c(9:10,20:21))) # coerces 18 and 19 to numeric
test(681, rbindlist(list(data.table(a=letters[1:2],b=c(1.2,1.3),c=1:2), list("c",1.4,3L), NULL, list(letters[4:6],c(1.5,1.6,1.7),4:6))),
data.table(a=letters[1:6], b=seq(1.2,1.7,by=0.1), c=1:6))
Expand Down Expand Up @@ -13669,8 +13667,8 @@ test(2002.4, rbindlist( list(list(a=1L,z=list()), list(a=2L, z=list("m"))) ),
warning="Column 2 ['z'] of item 1 is length 0. This (and 0 others like it) has been filled with NA")
test(2002.5, rbindlist( list( list(a=1L, z=list("z")), list(a=2L, z=list(c("a","b"))) )),
data.table(a=1:2, z=list("z", c("a","b"))))
test(2002.6, rbindlist( list( list(a=1, z=list("z",1)), list(a=2, z=list(c("a","b"))) )),
error="Column 2 of item 1 is length 2 inconsistent with the first column of that item which is length 1")
test(2002.6, rbindlist( list( list(a=1:2, z=list("z",1,"k")), list(a=2, z=list(c("a","b"))) )),
error="Column 1 of item 1 is length 2 inconsistent with column 2 which is length 3. Only length-1 columns are recycled.")
test(2002.7, rbindlist( list(list(a=1L, z=list(list())), list(a=2L, z=list(list("m")))) ),
data.table(a=1:2, z=list(list(),list("m"))))
test(2002.8, rbindlist( list(list(a=1L, z=list(list("z"))), list(a=2L, z=list(list(c("a","b"))))) ),
Expand Down Expand Up @@ -13722,7 +13720,7 @@ test(2006, rbindlist(list(data.table(x = as.raw(1), y=as.raw(3)), data.table(x =
if (test_bit64) {
test(2007.1, rbindlist(list( list(a=as.integer64(1), b=3L), list(a=2L, b=4L) )), data.table(a=as.integer64(1:2), b=3:4))
test(2007.2, rbindlist(list( list(a=3.4, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6),
warning="Column 1 of item 1 is being coerced to integer64 but contains a non-integer value [(]3.40.* at position 1[)]. Precision lost.")
warning="Column 1 of item 1: coerced to integer64 but contains a non-integer value [(]3.40.* at position 1[)]; precision lost")
test(2007.3, rbindlist(list( list(a=3.0, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6))
test(2007.4, rbindlist(list( list(b=5:6), list(a=as.integer64(4), b=7L)), fill=TRUE), data.table(b=5:7, a=as.integer64(c(NA,NA,4)))) # tests writeNA of integer64
test(2007.5, rbindlist(list( list(a=INT(1,NA,-2)), list(a=as.integer64(c(3,NA))) )), data.table(a=as.integer64(c(1,NA,-2,3,NA)))) # int NAs combined with int64 NA
Expand Down
96 changes: 80 additions & 16 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,6 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0]));
}

void memrecycle(SEXP target, SEXP where, int r, int len, SEXP source);

SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP verb)
{
// For internal use only by := in [.data.table, and set()
Expand Down Expand Up @@ -788,19 +786,20 @@ static bool anyNamed(SEXP x) {
return false;
}

void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source)
static char memrecycle_message[1000];

const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source)
// like memcpy but recycles single-item source
// 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer()
// assigns to target[start:start+len-1] or target[where[start:start+len-1]] where start is 0-based
{
if (len<1) return;
if (TYPEOF(target) != TYPEOF(source)) error("Internal error: TYPEOF(target)['%s']!=TYPEOF(source)['%s']", type2char(TYPEOF(target)),type2char(TYPEOF(source))); // # nocov
if (len<1) return NULL;
int slen = length(source);
if (slen==0) return;
if (slen==0) return NULL;
if (slen>1 && slen!=len) error("Internal error: recycle length error not caught earlier. slen=%d len=%d", slen, len); // # nocov
// Internal error because the column has already been added to the DT, so length mismatch should have been caught before adding the column.
// for 5647 this used to limit slen to len, but no longer

*memrecycle_message = '\0';
int protecti=0;
if (isNewList(source)) {
// A list() column; i.e. target is a column of pointers to SEXPs rather than the much more common case
Expand All @@ -820,28 +819,89 @@ void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source)
protecti++;
}
}
if (!length(where)) {
if (!length(where)) { // e.g. called from rbindlist with where=R_NilValue
switch (TYPEOF(target)) {
case LGLSXP: case INTSXP :
case RAWSXP:
if (TYPEOF(source)!=RAWSXP) { source = PROTECT(coerceVector(source, RAWSXP)); protecti++; }
if (slen==1) {
// recycle single items
int *td = INTEGER(target);
Rbyte *td = RAW(target)+start;
const Rbyte val = RAW(source)[0];
for (int i=0; i<len; i++) td[i] = val; // no R API inside loop as RAW()/INTEGER() etc have overhead even when inline functions
} else {
memcpy(RAW(target)+start, RAW(source), slen*SIZEOF(target));
}
break;
case LGLSXP: case INTSXP :
if (TYPEOF(source)!=LGLSXP && TYPEOF(source)!=INTSXP) { source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++; }
if (slen==1) {
int *td = INTEGER(target)+start;
const int val = INTEGER(source)[0];
for (int i=0; i<len; i++) td[start+i] = val; // no R API inside loop as INTEGER has overhead (even when it's an inline function)
for (int i=0; i<len; i++) td[i] = val;
} else {
memcpy(INTEGER(target)+start, INTEGER(source), slen*SIZEOF(target));
}
break;
case REALSXP :
case REALSXP : {
bool si64 = INHERITS(source, char_integer64);
bool ti64 = INHERITS(target, char_integer64);
if (si64 && TYPEOF(source)!=REALSXP)
error("Internal error: source has integer64 attribute but is type '%s' not REALSXP", type2char(TYPEOF(source))); // # nocov
if (si64 == ti64) {
if (TYPEOF(source)!=REALSXP) { source = PROTECT(coerceVector(source, REALSXP)); protecti++; }
if (slen==1) {
double *td = REAL(target)+start;
const double val = REAL(source)[0];
for (int i=0; i<len; i++) td[i] = val;
} else {
memcpy(REAL(target)+start, REAL(source), slen*SIZEOF(target));
}
} else if (si64) {
double *td = REAL(target)+start;
if (slen==1) {
const double val = (double)(((int64_t *)REAL(source))[0]);
for (int i=0; i<len; i++) td[i] = val;
} else {
const int64_t *val = (int64_t *)REAL(source);
for (int i=0; i<len; i++) td[i] = (double)(val[i]);
}
} else {
int64_t *td = (int64_t *)REAL(target)+start;
const int mask = slen==1 ? 0 : INT_MAX;
switch (TYPEOF(source)) {
case RAWSXP: {
const Rbyte *sd = RAW(source); // sd = source data
for (int i=0; i<len; ++i) td[i] = (int64_t)(sd[i&mask]); // raw has no NA
} break;
case LGLSXP : case INTSXP : {
const int *sd = INTEGER(source);
for (int i=0; i<len; ++i) td[i] = sd[i]==NA_INTEGER ? INT64_MIN : (int64_t)(sd[i]);
} break;
case REALSXP : {
int firstReal=0;
if ((firstReal=INTEGER(isReallyReal(source))[0])) {
sprintf(memrecycle_message, "coerced to integer64 but contains a non-integer value (%f at position %d); precision lost.", REAL(source)[firstReal-1], firstReal);
}
double *sd = REAL(source);
for (int i=0; i<len; ++i) td[i] = R_FINITE(sd[i]) ? (int)(sd[i]) : NA_INTEGER;
} break;
default :
error("Internal error: memrecycle integer64 column source is type '%s'", type2char(TYPEOF(source))); // # nocov
}
}
} break;
case CPLXSXP :
if (TYPEOF(source)!=CPLXSXP) { source = PROTECT(coerceVector(source, CPLXSXP)); protecti++; }
if (slen==1) {
double *td = REAL(target);
const double val = REAL(source)[0];
for (int i=0; i<len; i++) td[start+i] = val;
Rcomplex *td = COMPLEX(target)+start;
const Rcomplex val = COMPLEX(source)[0];
for (int i=0; i<len; ++i) td[i] = val;
} else {
memcpy(REAL(target)+start, REAL(source), slen*SIZEOF(target));
memcpy(COMPLEX(target)+start, COMPLEX(source), slen*SIZEOF(target));
}
break;
case STRSXP :
if (TYPEOF(source)!=STRSXP) { source = PROTECT(coerceVector(source, STRSXP)); protecti++; }
if (slen==1) {
const SEXP val = STRING_ELT(source, 0);
for (int i=0; i<len; i++) SET_STRING_ELT(target, start+i, val);
Expand All @@ -851,6 +911,7 @@ void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source)
}
break;
case VECSXP :
if (TYPEOF(source)!=VECSXP) { source = PROTECT(coerceVector(source, VECSXP)); protecti++; }
if (slen==1) {
const SEXP val = VECTOR_ELT(source, 0);
for (int i=0; i<len; i++) SET_VECTOR_ELT(target, start+i, val);
Expand All @@ -863,6 +924,8 @@ void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source)
error("Unsupported type in assign.c:memrecycle '%s' (no where)", type2char(TYPEOF(target))); // # nocov
}
} else {
if (TYPEOF(target) != TYPEOF(source))
error("Internal error: TYPEOF(target)['%s']!=TYPEOF(source)['%s'] in memrecycle (where)", type2char(TYPEOF(target)),type2char(TYPEOF(source))); // # nocov
const int *wd = INTEGER(where)+start;
const int mask = slen==1 ? 0 : INT_MAX;
switch (TYPEOF(target)) {
Expand Down Expand Up @@ -905,6 +968,7 @@ void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source)
}
}
UNPROTECT(protecti);
return memrecycle_message[0] ? memrecycle_message : NULL;
}

void writeNA(SEXP v, const int from, const int n)
Expand Down
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ SEXP dt_na(SEXP x, SEXP cols);

// assign.c
SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose);
void memrecycle(SEXP target, SEXP where, int r, int len, SEXP source);
const char *memrecycle(SEXP target, SEXP where, int r, int len, SEXP source);
SEXP shallowwrapper(SEXP dt, SEXP cols);

SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols,
Expand Down
94 changes: 15 additions & 79 deletions src/rbindlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
}
const bool idcol = !isNull(idcolArg);
if (idcol && (!isString(idcolArg) || LENGTH(idcolArg)!=1)) error("Internal error: rbindlist.c idcol is not a single string"); // # nocov

int ncol=0, first=0;
int64_t nrow=0;
bool anyNames=false;
int numZero=0, firstZeroCol=0, firstZeroItem=0;
int *eachMax = (int *)R_alloc(LENGTH(l), sizeof(int));
// pre-check for any errors here to save having to get cleanup right below when usenames
for (int i=0; i<LENGTH(l); i++) { // length(l)>0 checked above
eachMax[i] = 0;
SEXP li = VECTOR_ELT(l, i);
if (isNull(li)) continue;
if (TYPEOF(li) != VECSXP) error("Item %d of input is not a data.frame, data.table or list", i+1);
Expand All @@ -39,13 +40,15 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
int nNames = length(getAttrib(li, R_NamesSymbol));
if (nNames>0 && nNames!=thisncol) error("Item %d has %d columns but %d column names. Invalid object.", i+1, thisncol, nNames);
if (nNames>0) anyNames=true;
int thisnrow = length(VECTOR_ELT(li,0));
for (int j=1; j<thisncol; ++j) {
int maxLen=0, whichMax=0;
for (int j=0; j<thisncol; ++j) { int tt=length(VECTOR_ELT(li,j)); if (tt>maxLen) { maxLen=tt; whichMax=j; } }
for (int j=0; j<thisncol; ++j) {
int tt = length(VECTOR_ELT(li, j));
if (tt>0 && tt!=thisnrow) error("Column %d of item %d is length %d inconsistent with the first column of that item which is length %d. rbind/rbindlist expects each item in the input list to be a uniform list, data.frame or data.table", j+1, i+1, tt, thisnrow);
if (tt==0 && thisnrow>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; }
if (tt>1 && tt!=maxLen) error("Column %d of item %d is length %d inconsistent with column %d which is length %d. Only length-1 columns are recycled.", j+1, i+1, tt, whichMax+1, maxLen);
if (tt==0 && maxLen>0 && numZero++==0) { firstZeroCol = j; firstZeroItem=i; }
}
nrow += thisnrow;
eachMax[i] = maxLen;
nrow += maxLen;
}
if (numZero) { // #1871
SEXP names = getAttrib(VECTOR_ELT(l, firstZeroItem), R_NamesSymbol);
Expand Down Expand Up @@ -248,10 +251,9 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
int nLevel=0, allocLevel=0;
SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc.
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
if (!length(li)) continue;
const int thisnrow = length(VECTOR_ELT(li, 0));
const int thisnrow = eachMax[i];
if (thisnrow==0) continue;
SEXP li = VECTOR_ELT(l, i);
int w = usenames ? colMap[i*ncol + j] : j;
SEXP thisCol;
if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871
Expand Down Expand Up @@ -320,82 +322,16 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
setAttrib(target, R_ClassSymbol, ScalarString(char_factor));
} else {
for (int i=0; i<LENGTH(l); ++i) {
SEXP li = VECTOR_ELT(l, i);
if (!length(li)) continue;
const int thisnrow = length(VECTOR_ELT(li, 0));
const int thisnrow = eachMax[i];
if (thisnrow==0) continue;
SEXP li = VECTOR_ELT(l, i);
int w = usenames ? colMap[i*ncol + j] : j;
SEXP thisCol;
if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) {
writeNA(target, ansloc, thisnrow); // writeNA is integer64 aware and writes INT64_MIN
} else {
bool coerced = false;
if (!int64 && TYPEOF(thisCol)!=TYPEOF(target)) {
thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target)));
coerced = true;
// TO DO: options(datatable.pedantic=TRUE) to issue this warning :
// warning("Column %d of item %d is type '%s', inconsistent with column %d of item %d's type ('%s')",j+1,i+1,type2char(TYPEOF(thiscol)),j+1,first+1,type2char(TYPEOF(target)));
}
switch(TYPEOF(target)) {
case RAWSXP:
memcpy(RAW(target)+ansloc, RAW(thisCol), thisnrow*SIZEOF(thisCol));
break;
case LGLSXP:
memcpy(LOGICAL(target)+ansloc, LOGICAL(thisCol), thisnrow*SIZEOF(thisCol));
break;
case INTSXP:
memcpy(INTEGER(target)+ansloc, INTEGER(thisCol), thisnrow*SIZEOF(thisCol));
break;
case REALSXP:
if (!int64) {
memcpy(REAL(target)+ansloc, REAL(thisCol), thisnrow*SIZEOF(thisCol));
} else {
int64_t *td = (int64_t *)REAL(target) + ansloc; // td = target data
switch (TYPEOF(thisCol)) {
case RAWSXP: {
const Rbyte *sd = RAW(thisCol); // sd = source data
for (int r=0; r<thisnrow; ++r)
td[r] = (int64_t)(sd[r]); // raw has no NA
} break;
case LGLSXP : case INTSXP : {
const int *sd = INTEGER(thisCol);
for (int r=0; r<thisnrow; ++r)
td[r] = sd[r]==NA_INTEGER ? INT64_MIN : (int64_t)(sd[r]);
} break;
case REALSXP :
if (INHERITS(thisCol, char_integer64)) {
memcpy(td, REAL(thisCol), thisnrow*SIZEOF(thisCol));
} else {
int firstReal=0;
if ((firstReal=INTEGER(isReallyReal(thisCol))[0])) {
warning("Column %d of item %d is being coerced to integer64 but contains a non-integer value (%f at position %d). Precision lost.",
w+1, i+1, REAL(thisCol)[firstReal-1], firstReal);
}
double *sd = REAL(thisCol);
for (int r=0; r<thisnrow; ++r)
td[r] = R_FINITE(sd[r]) ? (int)(sd[r]) : NA_INTEGER;
}
break;
default :
error("Internal error: integer64 column has an element of type '%s'", type2char(TYPEOF(thisCol))); // # nocov
}
}
break;
case CPLXSXP :
memcpy(COMPLEX(target)+ansloc, COMPLEX(thisCol), thisnrow*sizeof(Rcomplex));
break;
case VECSXP :
for (int r=0; r<thisnrow; ++r)
SET_VECTOR_ELT(target, ansloc+r, VECTOR_ELT(thisCol,r));
break;
case STRSXP :
for (int r=0; r<thisnrow; ++r)
SET_STRING_ELT(target, ansloc+r, STRING_ELT(thisCol,r));
break;
default :
error("Unsupported column type '%s'", type2char(TYPEOF(target))); // # nocov
}
if (coerced) UNPROTECT(1);
const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol);
if (ret) warning("Column %d of item %d: %s", w+1, i+1, ret); // currently just one warning when precision is lost; e.g. assigning 3.4 to integer64
}
ansloc += thisnrow;
}
Expand Down

0 comments on commit de06ac6

Please sign in to comment.