programming on data.table (#4304)

Rdatatable · May 10, 2021 · 79c3d3e · 79c3d3e
1 parent cecf528
commit 79c3d3e
Show file tree

Hide file tree

Showing 14 changed files with 1,274 additions and 12 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -32,6 +32,7 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^\.idea$
+^\.libs$
 
 ^.*\.dll$
 

diff --git a/NAMESPACE b/NAMESPACE
@@ -56,6 +56,7 @@ export(nafill)
 export(setnafill)
 export(.Last.updated)
 export(fcoalesce)
+export(substitute2)
 
 S3method("[", data.table)
 S3method("[<-", data.table)

diff --git a/NEWS.md b/NEWS.md
@@ -58,6 +58,35 @@
 
 9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing.
 
+10. A new interface for _programming on data.table_ has been added, [#2655](https://github.com/Rdatatable/data.table/issues/2655) any many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing.
+
+    ```R
+    DT = data.table(x = 1:5, y = 5:1)
+
+    # parameters
+    in_col_name = "x"
+    fun = "sum"
+    fun_arg1 = "na.rm"
+    fun_arg1val = TRUE
+    out_col_name = "sum_x"
+
+    # parameterized query
+    #DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))]
+
+    # desired query
+    DT[, .(sum_x = sum(x, na.rm=TRUE))]
+
+    # new interface
+    DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)),
+      env = list(
+        in_col_name = "x",
+        fun = "sum",
+        fun_arg1 = "na.rm",
+        fun_arg1val = TRUE,
+        out_col_name = "sum_x"
+      )]
+    ```
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.

diff --git a/R/data.table.R b/R/data.table.R
@@ -121,7 +121,7 @@ replace_dot_alias = function(e) {
   }
 }
 
-"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL)
+"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL)
 {
   # ..selfcount <<- ..selfcount+1  # in dev, we check no self calls, each of which doubles overhead, or could
   # test explicitly if the caller is [.data.table (even stronger test. TO DO.)
@@ -151,15 +151,19 @@ replace_dot_alias = function(e) {
     keyby = FALSE
   } else {
     if (missing(by)) {
-      by = bysub = substitute(keyby)
+      by = bysub = if (is.null(env)) substitute(keyby)
+                                else eval(substitute(substitute2(.keyby, env), list(.keyby = substitute(keyby))))
       keyby = TRUE
     } else {
-      by = bysub = substitute(by)
+      by = bysub = if (is.null(env)) substitute(by)
+                               else  eval(substitute(substitute2(.by, env), list(.by = substitute(by))))
       if (missing(keyby))
         keyby = FALSE
       else if (!isTRUEorFALSE(keyby))
         stop("When by and keyby are both provided, keyby must be TRUE or FALSE")
     }
+    if (missing(by)) { missingby=TRUE; by=bysub=NULL }  # possible when env is used, PR#4304
+    else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="")
   }
   bynull = !missingby && is.null(by) #3530
   byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI"
@@ -215,7 +219,16 @@ replace_dot_alias = function(e) {
   av = NULL
   jsub = NULL
   if (!missing(j)) {
-    jsub = replace_dot_alias(substitute(j))
+    if (is.null(env)) jsub = substitute(j) else {
+      jsub = eval(substitute(
+        substitute2(.j, env),
+        list(.j = substitute(j))
+      ))
+      if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) cat("Argument 'j'  after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="")
+    }
+  }
+  if (!missing(j)) {
+    jsub = replace_dot_alias(jsub)
     root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
     if (root == ":" ||
         (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
@@ -291,10 +304,18 @@ replace_dot_alias = function(e) {
 
   # setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead.
   dupdiff = function(x, y) x[!x %chin% y]
-
+  isub = NULL
+  if (!missing(i)) {
+    if (is.null(env)) isub = substitute(i) else {
+      isub = eval(substitute(
+        substitute2(.i, env),
+        list(.i = substitute(i))
+      ))
+      if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) cat("Argument 'i'  after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="")
+    }
+  }
   if (!missing(i)) {
     xo = NULL
-    isub = substitute(i)
     if (identical(isub, NA)) {
       # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA]
       # replace NA in this case with NA_integer_ as that's almost surely what user intended to

diff --git a/R/programming.R b/R/programming.R
@@ -0,0 +1,80 @@
+is.AsIs = function(x) {
+  inherits(x, "AsIs")
+}
+rm.AsIs = function(x) {
+  cl = oldClass(x)
+  oldClass(x) = cl[cl!="AsIs"]
+  x
+}
+list2lang = function(x) {
+  if (!is.list(x))
+    stop("'x' must be a list")
+  if (is.AsIs(x))
+    return(rm.AsIs(x))
+  asis = vapply(x, is.AsIs, FALSE)
+  char = vapply(x, is.character, FALSE)
+  to.name = !asis & char
+  if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience
+    if (any(non.scalar.char <- vapply(x[to.name], length, 0L)!=1L)) {
+      stop("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: ",
+           paste(names(non.scalar.char)[non.scalar.char], collapse=", "))
+    }
+    x[to.name] = lapply(x[to.name], as.name)
+  }
+  if (isTRUE(getOption("datatable.enlist", TRUE))) { ## recursively enlist for nested lists, see note section in substitute2 manual
+    islt = vapply(x, is.list, FALSE)
+    to.enlist = !asis & islt
+    if (any(to.enlist)) {
+      x[to.enlist] = lapply(x[to.enlist], enlist)
+    }
+  }
+  if (any(asis)) {
+    x[asis] = lapply(x[asis], rm.AsIs)
+  }
+  x
+}
+enlist = function(x) {
+  if (!is.list(x))
+    stop("'x' must be a list")
+  if (is.AsIs(x))
+    return(rm.AsIs(x))
+  as.call(c(quote(list), list2lang(x)))
+}
+
+substitute2 = function(expr, env) {
+  if (missing(expr))
+    return(substitute())
+  if (missing(env)) {
+    stop("'env' must not be missing")
+  } else if (is.null(env)) {
+    # null is fine, will be escaped few lines below
+  } else if (is.environment(env)) {
+    env = as.list(env, all.names=TRUE, sorted=TRUE)
+  } else if (!is.list(env)) {
+    stop("'env' must be a list or an environment")
+  }
+  if (!length(env)) {
+    return(substitute(expr))
+  }
+  env.names = names(env)
+  if (is.null(env.names)) {
+    stop("'env' argument does not have names")
+  } else if (!all(nzchar(env.names))) {
+    stop("'env' argument has zero char names")
+  } else if (anyNA(env.names)) {
+    stop("'env' argument has NA names")
+  } else if (anyDuplicated(env.names)) {
+    stop("'env' argument has duplicated names")
+  }
+  # character to name/symbol, and list to list call
+  env = list2lang(env)
+  # R substitute
+  expr.sub = eval(substitute(
+    substitute(.expr, env),
+    env = list(.expr = substitute(expr))
+  ))
+  if (missing(expr.sub))
+    return(substitute()) ## nested emptiness
+  # substitute call argument names
+  .Call(Csubstitute_call_arg_namesR, expr.sub, env)
+}
diff --git a/R/test.data.table.R b/R/test.data.table.R
@@ -177,7 +177,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
   timings = env$timings
   DT = head(timings[-1L][order(-time)], 10L)   # exclude id 1 as in dev that includes JIT
   if ((x<-sum(timings[["nTest"]])) != ntest) {
-    warning("Timings count mismatch:",x,"vs",ntest)  # nocov
+    warning("Timings count mismatch: ",x," vs ",ntest)  # nocov
   }
   catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss))
   print(DT, class=FALSE)
@@ -260,6 +260,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
   # iv) if warning is supplied, y is checked to equal x, and x should result in a warning message matching the pattern
   # v) if output is supplied, x is evaluated and printed and the output is checked to match the pattern
   # num just needs to be numeric and unique. We normally increment integers at the end, but inserts can be made using decimals e.g. 10,11,11.1,11.2,12,13,...
+  # num=0 to escape global failure tracking so we can test behaviour of test function itself: test(1.1, test(0, TRUE, FALSE), FALSE, output="1 element mismatch")
   # Motivations:
   # 1) we'd like to know all tests that fail not just stop at the first. This often helps by revealing a common feature across a set of
   #    failing tests
@@ -273,7 +274,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
     prevtest = get("prevtest", parent.frame())
     nfail = get("nfail", parent.frame())   # to cater for both test.data.table() and stepping through tests in dev
     whichfail = get("whichfail", parent.frame())
-    assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE)   # bump number of tests run
+    assign("ntest", get("ntest", parent.frame()) + if (num>0) 1L else 0L, parent.frame(), inherits=TRUE)   # bump number of tests run
     lasttime = get("lasttime", parent.frame())
     timings = get("timings", parent.frame())
     memtest = get("memtest", parent.frame())
@@ -282,7 +283,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
     foreign = get("foreign", parent.frame())
     showProgress = get("showProgress", parent.frame())
     time = nTest = NULL  # to avoid 'no visible binding' note
-    on.exit( {
+    if (num>0) on.exit( {
        now = proc.time()[3L]
        took = now-lasttime  # so that prep time between tests is attributed to the following test
        assign("lasttime", now, parent.frame(), inherits=TRUE)
@@ -344,7 +345,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
     fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE)                                                                             # nocov
   }
   fail = FALSE
-  if (.test.data.table) {
+  if (.test.data.table && num>0) {
     if (num<prevtest+0.0000005) {
       # nocov start
       catf("Test id %s is not in increasing order\n", numStr)
@@ -454,7 +455,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
     }
     # nocov end
   }
-  if (fail && .test.data.table) {
+  if (fail && .test.data.table && num>0) {
     # nocov start
     assign("nfail", nfail+1L, parent.frame(), inherits=TRUE)
     assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE)