diff --git a/NEWS.md b/NEWS.md index c9f997b63e..c9e1c367ad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,8 @@ 2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea. +3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR. + ### Changes in v1.11.8 (30 Sep 2018) diff --git a/R/data.table.R b/R/data.table.R index e0b1a12d3b..d15a5f1712 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1017,18 +1017,22 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { } else { .SDcols = eval(colsub, parent.frame(), parent.frame()) } + if (anyNA(.SDcols)) + stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols)))) if (is.logical(.SDcols)) { ansvals = which_(rep(.SDcols, length.out=length(x)), !colm) ansvars = names(x)[ansvals] } else if (is.numeric(.SDcols)) { # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices") - if (anyNA(.SDcols) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)") + if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) + stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx))) if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols] ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols) } else { if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names") - if (anyNA(.SDcols) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)") + if (!all(idx <- .SDcols %chin% names(x))) + stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx])) if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove) ansvals = chmatch(ansvars, names(x)) diff --git a/R/fread.R b/R/fread.R index 214bb866ae..30212e13b4 100644 --- a/R/fread.R +++ b/R/fread.R @@ -218,7 +218,7 @@ setfactor <- function(x, cols, verbose) { setattr(ans, 'class', 'factor') } if (length(cols)) { - if (verbose) cat("Converting column(s) [", paste(names(x)[cols], collapse = ", "), "] from 'char' to 'factor'\n", sep = "") + if (verbose) cat("Converting column(s) ", brackify(names(x)[cols]), " from 'char' to 'factor'\n", sep = "") for (j in cols) set(x, j = j, value = as_factor(.subset2(x, j))) } invisible(x) diff --git a/R/utils.R b/R/utils.R index 09c0dec0d8..01b9cdf2d9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -86,3 +86,10 @@ name_dots <- function(...) { list(vnames=vnames, novname=novname) } +# convert a vector like c(1, 4, 3, 2) into a string like [1, 4, 3, 2] +# (common aggregation method for error messages) +brackify = function(x) { + # arbitrary cutoff + if (length(x) > 10L) x = c(x[1:10], '...') + sprintf('[%s]', paste(x, collapse = ', ')) +} diff --git a/R/xts.R b/R/xts.R index 5807478bc3..df296d76c9 100644 --- a/R/xts.R +++ b/R/xts.R @@ -11,7 +11,7 @@ as.xts.data.table <- function(x, ...) { stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x)) if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index - if (any(!colsNumeric)) warning(paste("Following columns are not numeric and will be omitted:", paste(names(colsNumeric)[!colsNumeric], collapse = ", "))) + if (any(!colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric])) r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]]) return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]])) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b427e022de..318e0fc943 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19,7 +19,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # in the test suite when user runs test.data.table() from installed package AND ii) so that in dev the same # tests can be used but in dev they test the package in .GlobalEnv. If we used ::: throughout tests, that # would pick up the installed version and in dev you'd have to reinstall every time which slows down dev. - # NB: The string "data.table:::" should exist nowhere else in this file other than here inside this branch. + # NB: The string "data.table::" (which covers "data.table:::" too) should exist nowhere else in this file + # other than here inside this branch. test = data.table:::test INT = data.table:::INT @@ -50,11 +51,11 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { as.IDate.default = data.table:::as.IDate.default binary = data.table:::binary melt.data.table = data.table:::melt.data.table # for test 1953.4 + brackify = data.table:::brackify # Also, for functions that are masked by other packages, we need to map the data.table one. Or else, # the other package's function would be picked up. As above, we only need to do this because we desire # to develop in .GlobalEnv with cc(). - # NB: The string "data.table::" should exist nowhere else in this file other than here inside this branch. # This should be retained even if these packages are removed from Suggests, because the test() in this file # checks against a data.table result which needs the data.table one to run. Otherwise the user can be # sure by using :: themselves. @@ -6286,6 +6287,19 @@ if (test_xts) { dt.xts <- as.xts.data.table(dt) test(1663, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) + # additional coverage missing uncovered in #3117 + dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5)) + xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index) + test(1465.9, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index']) + names(xt)[1L] = 'index' + test(1465.10, as.data.table(xt), error = 'Input xts object should not') + names(xt)[1L] = 'quantity' + setcolorder(dt, c(3, 1, 2)) + test(1465.11, as.xts(dt), error = 'data.table must have a time based') + setcolorder(dt, c(2, 3, 1)) + dt[ , char_col := 'a'] + test(1465.12, as.xts(dt), xt, warning = 'columns are not numeric') + Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) } @@ -12366,6 +12380,27 @@ test(1955.5, setnames(DT, "B", "bb", skip_absent=TRUE), data.table(aa=1, bb=2, d test(1955.6, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D")), error="Items of 'old' not found in column names: miss1,miss2. Consider skip_absent=TRUE") test(1955.7, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D"), skip_absent=TRUE), data.table(aa=1, B=2, D=3)) +# #3116 - Better error messages for missing/unmatched .SDcols +DT = data.table(a = 1:5) +test(1956.1, DT[, .SD, .SDcols = NA_character_], error = 'missing at the following') +test(1956.2, DT[, .SD, .SDcols = NA], error = 'missing at the following') +test(1956.3, DT[, .SD, .SDcols = NA_real_], error = 'missing at the following') +test(1956.4, DT[, .SD, .SDcols = 2L], error = 'out of bounds.*1.*1.*at') +test(1956.5, DT[, .SD, .SDcols = 'b'], error = 'not column names') +test(1956.6, DT[, .SD, .SDcols = 3i], error = '.SDcols should be column numbers or names') + +# added brackify to utils for #3116 +test(1957.1, brackify(1:3), '[1, 2, 3]') +test(1957.2, brackify(1:11), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...]") +test(1957.3, fread("A,B\na,b\nc,d\n", stringsAsFactors=TRUE, verbose=TRUE), data.table(A=factor(c("a","c")), B=factor(c("b","d"))), + output="Converting column(s) [A, B] from 'char' to 'factor'") + +# misc. coverage tests in fread +test(1958.1, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be") +test(1958.2, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) +test(1958.3, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) +test(1958.4, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') + ################################### # Add new tests above this line #