Improves diagnostic messaging of .SDcols when passed bad data (#3117)

Rdatatable · Nov 14, 2018 · 55168ed · 55168ed
1 parent 092fec3
commit 55168ed
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 6 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -24,6 +24,8 @@
 
 2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea.
 
+3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR.
+
 
 ### Changes in v1.11.8  (30 Sep 2018)
 

diff --git a/R/data.table.R b/R/data.table.R
@@ -1017,18 +1017,22 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
           } else {
             .SDcols = eval(colsub, parent.frame(), parent.frame())
           }
+          if (anyNA(.SDcols))
+            stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
           if (is.logical(.SDcols)) {
             ansvals = which_(rep(.SDcols, length.out=length(x)), !colm)
             ansvars = names(x)[ansvals]
           } else if (is.numeric(.SDcols)) {
             # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
             if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
-            if (anyNA(.SDcols) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
+            if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L))
+              stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx)))
             if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
             ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols)
           } else {
             if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
-            if (anyNA(.SDcols) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
+            if (!all(idx <- .SDcols %chin% names(x)))
+              stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx]))
             if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols
             # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
             ansvals = chmatch(ansvars, names(x))

diff --git a/R/fread.R b/R/fread.R
@@ -218,7 +218,7 @@ setfactor <- function(x, cols, verbose) {
     setattr(ans, 'class', 'factor')
   }
   if (length(cols)) {
-    if (verbose) cat("Converting column(s) [", paste(names(x)[cols], collapse = ", "), "] from 'char' to 'factor'\n", sep = "")
+    if (verbose) cat("Converting column(s) ", brackify(names(x)[cols]), " from 'char' to 'factor'\n", sep = "")
     for (j in cols) set(x, j = j, value = as_factor(.subset2(x, j)))
   }
   invisible(x)

diff --git a/R/utils.R b/R/utils.R
@@ -86,3 +86,10 @@ name_dots <- function(...) {
   list(vnames=vnames, novname=novname)
 }
 
+# convert a vector like c(1, 4, 3, 2) into a string like [1, 4, 3, 2]
+#   (common aggregation method for error messages)
+brackify = function(x) {
+  # arbitrary cutoff
+  if (length(x) > 10L) x = c(x[1:10], '...')
+  sprintf('[%s]', paste(x, collapse = ', '))
+}
diff --git a/R/xts.R b/R/xts.R
@@ -11,7 +11,7 @@ as.xts.data.table <- function(x, ...) {
   stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x))
   if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types")
   colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index
-  if (any(!colsNumeric)) warning(paste("Following columns are not numeric and will be omitted:", paste(names(colsNumeric)[!colsNumeric], collapse = ", ")))
+  if (any(!colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric]))
   r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]])
   return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]]))
 }
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -19,7 +19,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   # in the test suite when user runs test.data.table() from installed package AND ii) so that in dev the same
   # tests can be used but in dev they test the package in .GlobalEnv. If we used ::: throughout tests, that
   # would pick up the installed version and in dev you'd have to reinstall every time which slows down dev.
-  # NB: The string "data.table:::" should exist nowhere else in this file other than here inside this branch.
+  # NB: The string "data.table::" (which covers "data.table:::" too) should exist nowhere else in this file
+  #      other than here inside this branch.
 
   test = data.table:::test
   INT = data.table:::INT
@@ -50,11 +51,11 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   as.IDate.default = data.table:::as.IDate.default
   binary = data.table:::binary
   melt.data.table = data.table:::melt.data.table  # for test 1953.4
+  brackify = data.table:::brackify
 
   # Also, for functions that are masked by other packages, we need to map the data.table one. Or else,
   # the other package's function would be picked up. As above, we only need to do this because we desire
   # to develop in .GlobalEnv with cc().
-  # NB: The string "data.table::" should exist nowhere else in this file other than here inside this branch.
   # This should be retained even if these packages are removed from Suggests, because the test() in this file
   # checks against a data.table result which needs the data.table one to run. Otherwise the user can be
   # sure by using :: themselves.
@@ -6286,6 +6287,19 @@ if (test_xts) {
   dt.xts <- as.xts.data.table(dt)
   test(1663, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31")))
 
+  # additional coverage missing uncovered in #3117
+  dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
+  xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
+  test(1465.9, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index'])
+  names(xt)[1L] = 'index'
+  test(1465.10, as.data.table(xt), error = 'Input xts object should not')
+  names(xt)[1L] = 'quantity'
+  setcolorder(dt, c(3, 1, 2))
+  test(1465.11, as.xts(dt), error = 'data.table must have a time based')
+  setcolorder(dt, c(2, 3, 1))
+  dt[ , char_col := 'a']
+  test(1465.12, as.xts(dt), xt, warning = 'columns are not numeric')
+
   Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE)
 }
 
@@ -12366,6 +12380,27 @@ test(1955.5, setnames(DT, "B", "bb", skip_absent=TRUE), data.table(aa=1, bb=2, d
 test(1955.6, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D")), error="Items of 'old' not found in column names: miss1,miss2. Consider skip_absent=TRUE")
 test(1955.7, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D"), skip_absent=TRUE), data.table(aa=1, B=2, D=3))
 
+# #3116 - Better error messages for missing/unmatched .SDcols
+DT = data.table(a = 1:5)
+test(1956.1, DT[, .SD, .SDcols = NA_character_], error = 'missing at the following')
+test(1956.2, DT[, .SD, .SDcols = NA], error = 'missing at the following')
+test(1956.3, DT[, .SD, .SDcols = NA_real_], error = 'missing at the following')
+test(1956.4, DT[, .SD, .SDcols = 2L], error = 'out of bounds.*1.*1.*at')
+test(1956.5, DT[, .SD, .SDcols = 'b'], error = 'not column names')
+test(1956.6, DT[, .SD, .SDcols = 3i], error = '.SDcols should be column numbers or names')
+
+# added brackify to utils for #3116
+test(1957.1, brackify(1:3), '[1, 2, 3]')
+test(1957.2, brackify(1:11), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...]")
+test(1957.3, fread("A,B\na,b\nc,d\n", stringsAsFactors=TRUE, verbose=TRUE), data.table(A=factor(c("a","c")), B=factor(c("b","d"))),
+             output="Converting column(s) [A, B] from 'char' to 'factor'")
+
+# misc. coverage tests in fread
+test(1958.1, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be")
+test(1958.2, fread('a,b\n1,2', nrows = NA_real_),      data.table(a = 1L, b = 2L))
+test(1958.3, fread('a,b\n1,2', nrows = -1),            data.table(a = 1L, b = 2L))
+test(1958.4, fread('a,b\n1,2', key = 1),               error = 'must be a character vector naming columns')
+
 
 ###################################
 #  Add new tests above this line  #
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,6 +24,8 @@

		2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea.

		3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR.


		### Changes in v1.11.8 (30 Sep 2018)

Expand Down