Skip to content

Commit

Permalink
Improves diagnostic messaging of .SDcols when passed bad data (#3117)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico authored and mattdowle committed Nov 14, 2018
1 parent 092fec3 commit 55168ed
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 6 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

2. When `on=` is provided but not `i=`, a helpful error is now produced rather than silently ignoring `on=`. Thanks to Dirk Eddelbuettel for the idea.

3. `.SDcols=` is more helpful when passed non-existent columns, [#3116](https://github.com/Rdatatable/data.table/issues/3116) and [#3118](https://github.com/Rdatatable/data.table/issues/3118). Thanks to Michael Chirico for the investigation and PR.


### Changes in v1.11.8 (30 Sep 2018)

Expand Down
8 changes: 6 additions & 2 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1017,18 +1017,22 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
} else {
.SDcols = eval(colsub, parent.frame(), parent.frame())
}
if (anyNA(.SDcols))
stop(".SDcols missing at the following indices: ", brackify(which(is.na(.SDcols))))
if (is.logical(.SDcols)) {
ansvals = which_(rep(.SDcols, length.out=length(x)), !colm)
ansvars = names(x)[ansvals]
} else if (is.numeric(.SDcols)) {
# if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
if (anyNA(.SDcols) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L))
stop(".SDcols is numeric but out of bounds [1, ", ncol(x), "] at: ", brackify(which(idx)))
if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols)
} else {
if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
if (anyNA(.SDcols) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
if (!all(idx <- .SDcols %chin% names(x)))
stop("Some items of .SDcols are not column names: ", brackify(.SDcols[!idx]))
if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols
# dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
ansvals = chmatch(ansvars, names(x))
Expand Down
2 changes: 1 addition & 1 deletion R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ setfactor <- function(x, cols, verbose) {
setattr(ans, 'class', 'factor')
}
if (length(cols)) {
if (verbose) cat("Converting column(s) [", paste(names(x)[cols], collapse = ", "), "] from 'char' to 'factor'\n", sep = "")
if (verbose) cat("Converting column(s) ", brackify(names(x)[cols]), " from 'char' to 'factor'\n", sep = "")
for (j in cols) set(x, j = j, value = as_factor(.subset2(x, j)))
}
invisible(x)
Expand Down
7 changes: 7 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,10 @@ name_dots <- function(...) {
list(vnames=vnames, novname=novname)
}

# convert a vector like c(1, 4, 3, 2) into a string like [1, 4, 3, 2]
# (common aggregation method for error messages)
brackify = function(x) {
# arbitrary cutoff
if (length(x) > 10L) x = c(x[1:10], '...')
sprintf('[%s]', paste(x, collapse = ', '))
}
2 changes: 1 addition & 1 deletion R/xts.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ as.xts.data.table <- function(x, ...) {
stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x))
if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types")
colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index
if (any(!colsNumeric)) warning(paste("Following columns are not numeric and will be omitted:", paste(names(colsNumeric)[!colsNumeric], collapse = ", ")))
if (any(!colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric]))
r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]])
return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]]))
}
39 changes: 37 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
# in the test suite when user runs test.data.table() from installed package AND ii) so that in dev the same
# tests can be used but in dev they test the package in .GlobalEnv. If we used ::: throughout tests, that
# would pick up the installed version and in dev you'd have to reinstall every time which slows down dev.
# NB: The string "data.table:::" should exist nowhere else in this file other than here inside this branch.
# NB: The string "data.table::" (which covers "data.table:::" too) should exist nowhere else in this file
# other than here inside this branch.

test = data.table:::test
INT = data.table:::INT
Expand Down Expand Up @@ -50,11 +51,11 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
as.IDate.default = data.table:::as.IDate.default
binary = data.table:::binary
melt.data.table = data.table:::melt.data.table # for test 1953.4
brackify = data.table:::brackify

# Also, for functions that are masked by other packages, we need to map the data.table one. Or else,
# the other package's function would be picked up. As above, we only need to do this because we desire
# to develop in .GlobalEnv with cc().
# NB: The string "data.table::" should exist nowhere else in this file other than here inside this branch.
# This should be retained even if these packages are removed from Suggests, because the test() in this file
# checks against a data.table result which needs the data.table one to run. Otherwise the user can be
# sure by using :: themselves.
Expand Down Expand Up @@ -6286,6 +6287,19 @@ if (test_xts) {
dt.xts <- as.xts.data.table(dt)
test(1663, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31")))

# additional coverage missing uncovered in #3117
dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
test(1465.9, as.data.table(xt, keep.rownames = FALSE), dt[ , !'index'])
names(xt)[1L] = 'index'
test(1465.10, as.data.table(xt), error = 'Input xts object should not')
names(xt)[1L] = 'quantity'
setcolorder(dt, c(3, 1, 2))
test(1465.11, as.xts(dt), error = 'data.table must have a time based')
setcolorder(dt, c(2, 3, 1))
dt[ , char_col := 'a']
test(1465.12, as.xts(dt), xt, warning = 'columns are not numeric')

Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE)
}

Expand Down Expand Up @@ -12366,6 +12380,27 @@ test(1955.5, setnames(DT, "B", "bb", skip_absent=TRUE), data.table(aa=1, bb=2, d
test(1955.6, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D")), error="Items of 'old' not found in column names: miss1,miss2. Consider skip_absent=TRUE")
test(1955.7, setnames(DT, c("miss1","bb","miss2","dd"), c("A","B","C","D"), skip_absent=TRUE), data.table(aa=1, B=2, D=3))

# #3116 - Better error messages for missing/unmatched .SDcols
DT = data.table(a = 1:5)
test(1956.1, DT[, .SD, .SDcols = NA_character_], error = 'missing at the following')
test(1956.2, DT[, .SD, .SDcols = NA], error = 'missing at the following')
test(1956.3, DT[, .SD, .SDcols = NA_real_], error = 'missing at the following')
test(1956.4, DT[, .SD, .SDcols = 2L], error = 'out of bounds.*1.*1.*at')
test(1956.5, DT[, .SD, .SDcols = 'b'], error = 'not column names')
test(1956.6, DT[, .SD, .SDcols = 3i], error = '.SDcols should be column numbers or names')

# added brackify to utils for #3116
test(1957.1, brackify(1:3), '[1, 2, 3]')
test(1957.2, brackify(1:11), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...]")
test(1957.3, fread("A,B\na,b\nc,d\n", stringsAsFactors=TRUE, verbose=TRUE), data.table(A=factor(c("a","c")), B=factor(c("b","d"))),
output="Converting column(s) [A, B] from 'char' to 'factor'")

# misc. coverage tests in fread
test(1958.1, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be")
test(1958.2, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L))
test(1958.3, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L))
test(1958.4, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns')


###################################
# Add new tests above this line #
Expand Down

0 comments on commit 55168ed

Please sign in to comment.