Skip to content

Commit

Permalink
programming on data.table (#4304)
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki authored May 10, 2021
1 parent cecf528 commit 79c3d3e
Show file tree
Hide file tree
Showing 14 changed files with 1,274 additions and 12 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.idea$
^\.libs$

^.*\.dll$

Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export(nafill)
export(setnafill)
export(.Last.updated)
export(fcoalesce)
export(substitute2)

S3method("[", data.table)
S3method("[<-", data.table)
Expand Down
29 changes: 29 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,35 @@

9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing.

10. A new interface for _programming on data.table_ has been added, [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing.

```R
DT = data.table(x = 1:5, y = 5:1)

# parameters
in_col_name = "x"
fun = "sum"
fun_arg1 = "na.rm"
fun_arg1val = TRUE
out_col_name = "sum_x"

# parameterized query
#DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))]

# desired query
DT[, .(sum_x = sum(x, na.rm=TRUE))]

# new interface
DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)),
env = list(
in_col_name = "x",
fun = "sum",
fun_arg1 = "na.rm",
fun_arg1val = TRUE,
out_col_name = "sum_x"
)]
```

## BUG FIXES

1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.
Expand Down
33 changes: 27 additions & 6 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ replace_dot_alias = function(e) {
}
}

"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL)
{
# ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could
# test explicitly if the caller is [.data.table (even stronger test. TO DO.)
Expand Down Expand Up @@ -151,15 +151,19 @@ replace_dot_alias = function(e) {
keyby = FALSE
} else {
if (missing(by)) {
by = bysub = substitute(keyby)
by = bysub = if (is.null(env)) substitute(keyby)
else eval(substitute(substitute2(.keyby, env), list(.keyby = substitute(keyby))))
keyby = TRUE
} else {
by = bysub = substitute(by)
by = bysub = if (is.null(env)) substitute(by)
else eval(substitute(substitute2(.by, env), list(.by = substitute(by))))
if (missing(keyby))
keyby = FALSE
else if (!isTRUEorFALSE(keyby))
stop("When by and keyby are both provided, keyby must be TRUE or FALSE")
}
if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304
else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="")
}
bynull = !missingby && is.null(by) #3530
byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI"
Expand Down Expand Up @@ -215,7 +219,16 @@ replace_dot_alias = function(e) {
av = NULL
jsub = NULL
if (!missing(j)) {
jsub = replace_dot_alias(substitute(j))
if (is.null(env)) jsub = substitute(j) else {
jsub = eval(substitute(
substitute2(.j, env),
list(.j = substitute(j))
))
if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="")
}
}
if (!missing(j)) {
jsub = replace_dot_alias(jsub)
root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
if (root == ":" ||
(root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
Expand Down Expand Up @@ -291,10 +304,18 @@ replace_dot_alias = function(e) {

# setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead.
dupdiff = function(x, y) x[!x %chin% y]

isub = NULL
if (!missing(i)) {
if (is.null(env)) isub = substitute(i) else {
isub = eval(substitute(
substitute2(.i, env),
list(.i = substitute(i))
))
if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="")
}
}
if (!missing(i)) {
xo = NULL
isub = substitute(i)
if (identical(isub, NA)) {
# only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA]
# replace NA in this case with NA_integer_ as that's almost surely what user intended to
Expand Down
80 changes: 80 additions & 0 deletions R/programming.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
is.AsIs = function(x) {
inherits(x, "AsIs")
}
rm.AsIs = function(x) {
cl = oldClass(x)
oldClass(x) = cl[cl!="AsIs"]
x
}
list2lang = function(x) {
if (!is.list(x))
stop("'x' must be a list")
if (is.AsIs(x))
return(rm.AsIs(x))
asis = vapply(x, is.AsIs, FALSE)
char = vapply(x, is.character, FALSE)
to.name = !asis & char
if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience
if (any(non.scalar.char <- vapply(x[to.name], length, 0L)!=1L)) {
stop("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: ",
paste(names(non.scalar.char)[non.scalar.char], collapse=", "))
}
x[to.name] = lapply(x[to.name], as.name)
}
if (isTRUE(getOption("datatable.enlist", TRUE))) { ## recursively enlist for nested lists, see note section in substitute2 manual
islt = vapply(x, is.list, FALSE)
to.enlist = !asis & islt
if (any(to.enlist)) {
x[to.enlist] = lapply(x[to.enlist], enlist)
}
}
if (any(asis)) {
x[asis] = lapply(x[asis], rm.AsIs)
}
x
}
enlist = function(x) {
if (!is.list(x))
stop("'x' must be a list")
if (is.AsIs(x))
return(rm.AsIs(x))
as.call(c(quote(list), list2lang(x)))
}

substitute2 = function(expr, env) {
if (missing(expr))
return(substitute())
if (missing(env)) {
stop("'env' must not be missing")
} else if (is.null(env)) {
# null is fine, will be escaped few lines below
} else if (is.environment(env)) {
env = as.list(env, all.names=TRUE, sorted=TRUE)
} else if (!is.list(env)) {
stop("'env' must be a list or an environment")
}
if (!length(env)) {
return(substitute(expr))
}
env.names = names(env)
if (is.null(env.names)) {
stop("'env' argument does not have names")
} else if (!all(nzchar(env.names))) {
stop("'env' argument has zero char names")
} else if (anyNA(env.names)) {
stop("'env' argument has NA names")
} else if (anyDuplicated(env.names)) {
stop("'env' argument has duplicated names")
}
# character to name/symbol, and list to list call
env = list2lang(env)
# R substitute
expr.sub = eval(substitute(
substitute(.expr, env),
env = list(.expr = substitute(expr))
))
if (missing(expr.sub))
return(substitute()) ## nested emptiness
# substitute call argument names
.Call(Csubstitute_call_arg_namesR, expr.sub, env)
}
11 changes: 6 additions & 5 deletions R/test.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
timings = env$timings
DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT
if ((x<-sum(timings[["nTest"]])) != ntest) {
warning("Timings count mismatch:",x,"vs",ntest) # nocov
warning("Timings count mismatch: ",x," vs ",ntest) # nocov
}
catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss))
print(DT, class=FALSE)
Expand Down Expand Up @@ -260,6 +260,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
# iv) if warning is supplied, y is checked to equal x, and x should result in a warning message matching the pattern
# v) if output is supplied, x is evaluated and printed and the output is checked to match the pattern
# num just needs to be numeric and unique. We normally increment integers at the end, but inserts can be made using decimals e.g. 10,11,11.1,11.2,12,13,...
# num=0 to escape global failure tracking so we can test behaviour of test function itself: test(1.1, test(0, TRUE, FALSE), FALSE, output="1 element mismatch")
# Motivations:
# 1) we'd like to know all tests that fail not just stop at the first. This often helps by revealing a common feature across a set of
# failing tests
Expand All @@ -273,7 +274,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
prevtest = get("prevtest", parent.frame())
nfail = get("nfail", parent.frame()) # to cater for both test.data.table() and stepping through tests in dev
whichfail = get("whichfail", parent.frame())
assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE) # bump number of tests run
assign("ntest", get("ntest", parent.frame()) + if (num>0) 1L else 0L, parent.frame(), inherits=TRUE) # bump number of tests run
lasttime = get("lasttime", parent.frame())
timings = get("timings", parent.frame())
memtest = get("memtest", parent.frame())
Expand All @@ -282,7 +283,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
foreign = get("foreign", parent.frame())
showProgress = get("showProgress", parent.frame())
time = nTest = NULL # to avoid 'no visible binding' note
on.exit( {
if (num>0) on.exit( {
now = proc.time()[3L]
took = now-lasttime # so that prep time between tests is attributed to the following test
assign("lasttime", now, parent.frame(), inherits=TRUE)
Expand Down Expand Up @@ -344,7 +345,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov
}
fail = FALSE
if (.test.data.table) {
if (.test.data.table && num>0) {
if (num<prevtest+0.0000005) {
# nocov start
catf("Test id %s is not in increasing order\n", numStr)
Expand Down Expand Up @@ -454,7 +455,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
}
# nocov end
}
if (fail && .test.data.table) {
if (fail && .test.data.table && num>0) {
# nocov start
assign("nfail", nfail+1L, parent.frame(), inherits=TRUE)
assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE)
Expand Down
Loading

0 comments on commit 79c3d3e

Please sign in to comment.