diff --git a/NEWS.md b/NEWS.md index acefbbf4e8..dff623278e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ 3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and Benjamin Schwendinger for the fix. +4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/fcast.R b/R/fcast.R index bb59d8409b..7c0766cfef 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -152,7 +152,6 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., dat = .Call(CsubsetDT, dat, idx, seq_along(dat)) } fun.call = m[["fun.aggregate"]] - fill.default = NULL if (is.null(fun.call)) { oo = forderv(dat, by=varnames, retGrp=TRUE) if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) { @@ -160,15 +159,15 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., fun.call = quote(length) } } - if (!is.null(fun.call)) { + dat_for_default_fill = dat + run_agg_funs = !is.null(fun.call) + if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) - errmsg = gettext("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") - if (is.null(fill)) { - fill.default = suppressWarnings(dat[0L][, eval(fun.call)]) - # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stopf(errmsg)) - if (nrow(fill.default) != 1L) stopf(errmsg) + maybe_err = function(list.of.columns) { + if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") + list.of.columns } - dat = dat[, eval(fun.call), by=c(varnames)] + dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)] } order_ = function(x) { o = forderv(x, retGrp=TRUE, sort=TRUE) @@ -211,7 +210,12 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., } maplen = vapply_1i(mapunique, length) idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join. - ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call)) + some_fill = anyNA(idx) + fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))] + if (run_agg_funs && is.null(fill) && some_fill) { + fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))] + } + ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill) allcols = do.call("paste", c(rhs, sep=sep)) if (length(valnames) > 1L) allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1b507112ce..4c06fac21f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3729,6 +3729,21 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2],20,TRUE), d1=runif(20), d2=1L) test(1102.38, names(dcast(DT, x ~ y + z, fun.aggregate=length, value.var = "d2", sep=".")), c("x", "1.a", "1.b", "2.a", "2.b")) + + # test for #5512, only compute default fill if needed. + DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) + mymin <- function(x){ + if (!length(x)) stop("calling mymin on vector of length 0") + min(x) + } + test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) # fill not used in output, so default fill not computed. + ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") + test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c("no non-missing arguments to min; returning Inf", "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. + test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()). + test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning="no non-missing arguments to min; returning Inf") # only one warning, because no coercion. + test(1102.43, dcast(DT, int ~ chr, min, value.var="dbl", fill="coerced to NA"), data.table(int=1:3, a=c(4,NA,NA), b=c(NA,5,6), key="int"), warning=c("Coercing 'character' RHS to 'double' to match the type of target vector.", "NAs introduced by coercion")) + test(1102.44, dcast(DT, int ~ ., value.var="dbl", fill="ignored"), data.table(int=1:3, .=c(4,5,6), key="int")) + } # test for freading commands diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index 2aa265a96c..8d35c199d5 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -22,7 +22,7 @@ \item{\dots}{Any other arguments that may be passed to the aggregating function.} \item{margins}{Not implemented yet. Should take variable names to compute margins on. A value of \code{TRUE} would compute all margins.} \item{subset}{Specified if casting should be done on a subset of the data. Ex: \code{subset = .(col1 <= 5)} or \code{subset = .(variable != "January")}.} - \item{fill}{Value with which to fill missing cells. If \code{fun.aggregate} is present, takes the value by applying the function on a 0-length vector.} + \item{fill}{Value with which to fill missing cells. If \code{fill=NULL} and missing cells are present, then \code{fun.aggregate} is used on a 0-length vector to obtain a fill value.} \item{drop}{\code{FALSE} will cast by including all missing combinations. \code{c(FALSE, TRUE)} will only include all missing combinations of formula \code{LHS}; \code{c(TRUE, FALSE)} will only include all missing combinations of formula RHS. See Examples.} diff --git a/src/data.table.h b/src/data.table.h index 0a6eb207a8..da82af7be2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -289,7 +289,7 @@ SEXP setlistelt(SEXP, SEXP, SEXP); SEXP address(SEXP); SEXP expandAltRep(SEXP); SEXP fmelt(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP issorted(SEXP, SEXP); SEXP gforce(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP gsum(SEXP, SEXP); diff --git a/src/fcast.c b/src/fcast.c index 8c49c6fe28..d049711bf5 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -4,7 +4,7 @@ // raise(SIGINT); // TO DO: margins -SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg) { +SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg, SEXP some_fillArg) { int nrows=INTEGER(nrowArg)[0], ncols=INTEGER(ncolArg)[0]; int nlhs=length(lhs), nval=length(val), *idx = INTEGER(idxArg); SEXP target; @@ -15,24 +15,28 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SET_VECTOR_ELT(ans, i, VECTOR_ELT(lhs, i)); } // get val cols + bool some_fill = LOGICAL(some_fillArg)[0]; for (int i=0; i