diff --git a/NEWS.md b/NEWS.md index e13ec71fa7..10ec47757c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -126,6 +126,46 @@ 22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. +22. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. + +Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. + +Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. + +```r +d2 = data.table(id=2:1, v2=1:2) +setindexv(d2, "id") +str(attr(attr(d2, "index"), "__id")) +# int [1:2] 2 1 +# - attr(*, "starts")= int [1:2] 1 2 +# - attr(*, "maxgrpn")= int 1 +# - attr(*, "anyna")= int 0 +# - attr(*, "anyinfnan")= int 0 +# - attr(*, "anynotascii")= int 0 +# - attr(*, "anynotutf8")= int 0 + +d2 = data.table(id=2:1, v2=1:2) +invisible(d2[id==1L]) +str(attr(attr(d2, "index"), "__id")) +# int [1:2] 2 1 +``` + +This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. + +```r +d1 = data.table(id=1:2, v1=1:2) +d2 = data.table(id=2:1, v2=1:2) +setindexv(d2, "id") +d1[d2, on="id", verbose=TRUE] +#... +#Starting bmerge ... +#forderReuseSorting: using existing index: __id +#forderReuseSorting: opt=2, took 0.000s +#... +``` + +This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/R/bmerge.R b/R/bmerge.R index 881b8528ee..16d58fa539 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -118,9 +118,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } - ## after all modifications of i, check if i has a proper key on all icols - io = identical(icols, head(chmatch(key(i), names(i)), length(icols))) - ## after all modifications of x, check if x has a proper key on all xcols. ## If not, calculate the order. Also for non-equi joins, the order must be calculated. non_equi = which.first(ops != 1L) # 1 is "==" operator @@ -180,7 +177,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()} - ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) + ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()} # TO DO: xo could be moved inside Cbmerge diff --git a/R/data.table.R b/R/data.table.R index 99d06fad75..cb32836b03 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3241,13 +3241,13 @@ is_constantish = function(q, check_singleton=FALSE) { if (is.null(idx)){ ## if nothing else helped, auto create a new index that can be used if (!getOption("datatable.auto.index")) return(NULL) - if (verbose) {catf("Creating new index '%s'\n", paste(names(i), collapse = "__"));flush.console()} - if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(names(i), collapse = "__"));flush.console()} - setindexv(x, names(i)) - if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} - if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(names(i), collapse = "__"));flush.console()} - idx = attr(attr(x, "index", exact=TRUE), paste("__", names(i), collapse = ""), exact=TRUE) idxCols = names(i) + if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} + if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()} + idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, reuseSorting=TRUE) + maybe_reset_index(x, idx, idxCols) ## forder can write index, but disabled for now, see #4386 + if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} + if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} } if(!is.null(idxCols)){ setkeyv(i, idxCols) diff --git a/R/setkey.R b/R/setkey.R index b50cb8da90..f43ed39aa2 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -50,23 +50,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU miss = !(cols %chin% colnames(x)) if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss])) - ## determine, whether key is already present: - if (identical(key(x),cols)) { - if (!physical) { - ## create index as integer() because already sorted by those columns - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) - } - return(invisible(x)) - } else if(identical(head(key(x), length(cols)), cols)){ - if (!physical) { - ## create index as integer() because already sorted by those columns - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) - } else { - ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. - setattr(x,"sorted",cols) - } + if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387 + ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. + setattr(x,"sorted",cols) return(invisible(x)) } @@ -77,26 +63,20 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov - newkey = paste(cols, collapse="__") - if (!any(indices(x) == newkey)) { - if (verbose) { - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R - # suppress needed for tests 644 and 645 in verbose mode - catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) - } else { - o = forderv(x, cols, sort=TRUE, retGrp=FALSE) - } + if (verbose) { + # we now also retGrp=TRUE #4387 for !physical + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + # suppress needed for tests 644 and 645 in verbose mode + catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) } else { - if (verbose) catf("setkey on columns %s using existing index '%s'\n", brackify(cols), newkey) - o = getindex(x, newkey) + o = forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE) } - if (!physical) { - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) + if (!physical) { # index COULD BE saved from C forderReuseSorting already, but disabled for now + maybe_reset_index(x, o, cols) return(invisible(x)) } - setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { + setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. Only when order changes. if (verbose) { last.started.at = proc.time() } .Call(Creorder,x,o) if (verbose) { catf("reorder took %s\n", timetaken(last.started.at)); flush.console() } @@ -124,7 +104,7 @@ getindex = function(x, name) { if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) { stopf("Internal error: index '%s' exists but is invalid", name) # nocov } - ans + c(ans) ## drop starts and maxgrpn attributes } haskey = function(x) !is.null(key(x)) @@ -160,19 +140,24 @@ is.sorted = function(x, by=NULL) { # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) } +maybe_reset_index = function(x, idx, cols) { + if (isTRUE(getOption("datatable.forder.auto.index"))) return(invisible()) + if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) + setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), idx) + invisible(x) +} + ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) -{ +forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, reuseSorting=getOption("datatable.reuse.sorting", NA)) { if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL } else { if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L)) by = colnamesInt(x, by, check_dups=FALSE) - if (length(order) == 1L) order = rep(order, length(by)) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(Cforder, x, by, retGrp, sort, order, na.last) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) @@ -209,7 +194,7 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) data = eval(sub, parent.frame(), parent.frame()) } stopifnot(isTRUEorFALSE(decreasing)) - o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last) + o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last) if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o o } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c7f0833cf0..b90743e28e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6377,7 +6377,7 @@ test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) setkey(DT, NULL) setindex(DT, a) test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"), - output="setkey on columns [a] using existing index 'a'") # checks also that the prior index a is dropped (because y is keyed with no index) + output="using existing index: __a") # setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals) DT = data.table(a = c(3,3,4,4,5,6,1,1,7,2), @@ -6392,7 +6392,7 @@ setindex(DT, aaa, a) setindex(DT, aaa) # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. ensure no partial matching test(1419.65, allIndicesValid(DT), TRUE) test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"), - output="setkey on columns [aaa] using existing index 'aaa'") # checks that all indexes are dropped (aaa_a too) + output="using existing index: __aaa") # checks that all indexes are dropped (aaa_a too) # setnames updates secondary key DT = data.table(a=1:5,b=10:6) @@ -11757,7 +11757,7 @@ test(1775.3, capture.output(print(DT2, print.keys = TRUE)), setkey(DT2, a) setindexv(DT2, c("b","a")) test(1775.4, capture.output(print(DT2, print.keys = TRUE)), - c("Key: ", "Index: ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) + c("Key: ", "Indices: , ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) ## index 'b' is still good, so we keep it # dev regression #2285 cat("A B C\n1 2 3\n4 5 6", file=f<-tempfile()) @@ -12756,7 +12756,7 @@ test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) DT = data.table(a = c(3, 2, 1, 2, 3), b = c(1, 2, 1, 1, 2)) setindexv(DT, list('a', c('a', 'b'))) test(1897.1, indices(DT), c("a", "a__b")) -test(1897.2, attributes(attr(DT, 'index')), +test(1897.2, lapply(attributes(attr(DT, 'index')), c), ## lapply(, c) to ensure no starts, maxgrpn attributes list(`__a` = c(3L, 2L, 4L, 1L, 5L), `__a__b` = c(3L, 4L, 2L, 1L, 5L))) @@ -12796,9 +12796,9 @@ test(1899.18, as.matrix(DT, rownames=TRUE, rownames.value=1:nrow(DT)), error="ro # index argument for fread, #2633 DT_str = c('a,b\n3,1\n2,2\n1,1\n2,1\n3,2') -test(1900.1, attributes(attr(fread(DT_str, index = 'a'), 'index')), +test(1900.1, lapply(attributes(attr(fread(DT_str, index = 'a'), 'index')), c), # lapply(, c) to ensure no starts, maxgrpn attributes list(`__a` = c(3L, 2L, 4L, 1L, 5L))) -test(1900.2, attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), +test(1900.2, lapply(attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), c), list(`__a__b` = c(3L, 4L, 2L, 1L, 5L), `__b__a` = c(3L, 4L, 1L, 2L, 5L), `__a` = c(3L, 2L, 4L, 1L, 5L))) @@ -12809,7 +12809,7 @@ test(1900.4, fread(DT_str, index = list('a', 1L)), # col.names applied before index test(1900.5, fread(DT_str, col.names = c('c', 'd'), index = 'a'), error = 'some columns are not in the data.table') -test(1900.6, attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')), +test(1900.6, lapply(attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')), c), list(`__a__b` = c(3L, 4L, 2L, 1L, 5L))) # . within bquote shouldn't be swapped to list, #1912 @@ -13307,7 +13307,7 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), # appearance order of two low-cardinality columns that were squashed in pr#3124 DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) -test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L)) +test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L)) # skip values that are not present in old, #3030 DT <- data.table(a=1, b=2, d=3) @@ -13546,11 +13546,11 @@ test(1962.042, forderv(DT, na.last = c(TRUE, FALSE)), error='na.last must be lo test(1962.043, forderv(DT$a, by = 'a'), error='x is a single vector, non-NULL') test(1962.044, forderv(DT$a, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1') test(1962.045, forderv(DT$a, order = c(1L, -1L)), error='Input is an atomic vector (not a list of columns) but order= is not a length 1 integer') -test(1962.0461, forderv(DT, order = c(1L, -1L)), error="Either order= is not integer or its length (2) is different to by='s length (1)") +test(1962.0461, forderv(DT, order = c(1L, -1L)), error="length (2) is different to by='s length (1)") test(1962.0462, forderv(DT, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1') test(1962.0471, forderv(mean), error="'x' argument must be data.table compatible") test(1962.0472, forderv(DT, by=mean), error="argument specifying columns must be character or numeric") -test(1962.0473, forderv(NULL), error="DT is an empty list() of 0 columns") +test(1962.0473, forderv(NULL), error="DT is NULL") setDF(DT) test(1962.0481, forder(DT), 3:1) @@ -14292,12 +14292,12 @@ test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not") # forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8. -test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L)) -test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L)) -test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L)) -test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L)) -test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L)) -test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L)) +test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3) # 0 length items should not result in no-recycle error, #3386 @@ -18814,3 +18814,156 @@ y = data.table(a = 2:3, key="a") test(2274.31, merge(x,y, all.y=TRUE), data.table(a=structure(2:3, class=c("a", "integer")), key="a")) test(2274.32, rbind(x,y), error="Class attribute .* does not match with .*") test(2274.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, 3L), class=c("a", "integer")))) + +# lazy forder, #4386 +dd = data.table(a=1:2, b=2:1) +d = copy(dd) +test(2275.01, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2275.02, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +setkeyv(d, "b") +test(2275.03, options=c(datatable.verbose=TRUE), forderv(d, "b"), integer(), output="forder.*opt=1.*took") +test(2275.04, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, "b") +test(2275.05, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=2.*took") +test(2275.06, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +test(2275.11, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2275.12, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.13, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2275.14, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +setkeyv(d, c("a","b")) +test(2275.21, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") +test(2275.22, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.23, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2275.24, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +setkeyv(d, c("b","a")) +test(2275.25, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") +test(2275.26, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.27, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") +test(2275.28, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, c("a","b")) +test(2275.31, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2275.32, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.33, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2275.34, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, c("b","a")) +test(2275.35, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2275.36, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.37, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2275.38, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, list(c("a","b"), c("b","a"))) +test(2275.41, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2275.42, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.43, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2275.44, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setkeyv(d, c("a","b")) +setindexv(d, list(c("a","b"), c("b","a"))) +test(2275.51, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached +test(2275.52, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.53, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2275.54, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, list(c("a","b"), c("b","a"))) +test(2275.55, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") +test(2275.56, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +d = copy(dd) +setkeyv(d, c("a","b")) +setindexv(d, list(c("a","b"), c("b","a"))) +ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) +ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) +test(2275.60, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") # c(): strip attributes +test(2275.61, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") +test(2275.62, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") +test(2275.63, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") +test(2275.64, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, reuseSorting=FALSE), ba, output="forder.*opt=0.*took") +test(2275.65, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute +test(2275.66, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.67, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute +test(2275.68, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.69, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2275.70, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") +test(2275.71, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2275.72, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") +test(2275.73, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") +test(2275.74, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.75, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") +test(2275.76, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.77, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") +test(2275.78, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.79, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") +test(2275.80, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.81, options=c(datatable.verbose=TRUE), forderv(1:2), integer(), output="forder.*opt=0.*took") +test(2275.82, options=c(datatable.verbose=TRUE), forderv(1:2, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.83, options=c(datatable.verbose=TRUE), forderv(2:1), 2:1, output="forder.*opt=0.*took") +test(2275.84, options=c(datatable.verbose=TRUE), forderv(2:1, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") +test(2275.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") +ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg +d = copy(ddd) +test(2275.8530, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") +test(2275.8531, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") +test(2275.854, options=c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") +setattr(d, "index", setattr(integer(), "__v1", o)) +test(2275.855, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") +test(2275.856, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2275.857, options=c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") +d = copy(ddd) +test(2275.8580, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") # _not_ setindex(d, v1), which will compute retGrp/retStats +test(2275.8581, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") +setindexv(d, "v2") +test(2275.859, options=c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") +d = copy(ddd) +setkeyv(d, "v1") +setindexv(d, list("v2","v3","v4","v5",c("v1","v2"),c("v1","v3"),c("v2","v3"),c("v1","v4"),c("v1","v5"),c("v1","v4","v5"))) +test(2275.861, options=c(datatable.verbose=TRUE), forderv(d, "v1"), integer(), output="forder.*opt=1.*took") +test(2275.862, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last +setindexv(d, "v1") +test(2275.863, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2275.864, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.865, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2275.866, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.867, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2275.868, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.869, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2275.870, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.871, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2275.872, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.873, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +d = fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1") ## re-use some existing non utf8 data +anyEnc = function(x) unlist(attributes(forderv(x, retStats=TRUE))[c("anynotascii","anynotutf8")]) +test(2275.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) +test(2275.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) +test(2275.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) +d = copy(dd) +setindexv(d, "b") +test(2275.91, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), + forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2275.92, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), + forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = data.table(x = 2:1) +test(2275.93, options=c(datatable.optimize=Inf), {d[x == 1L]; attr(attr(d, "index"), "__x")}, 2:1) +test(2275.94, options=c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +d = data.table(x = 2:1) +test(2275.95, options=list(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE, datatable.optimize=Inf), + d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") +test(2275.96, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") +setindexv(d, NULL) +test(2275.971, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL) +test(2275.972, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL) +test(2275.973, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2275.974, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2275.975, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2275.99, forderv(data.table(a=1), reuseSorting=c(TRUE, TRUE)), error="reuseSorting must be") diff --git a/src/bmerge.c b/src/bmerge.c index 351baff283..108d828610 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -39,7 +39,11 @@ static Rboolean rollToNearest=FALSE; void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisgrp, int lowmax, int uppmax); -SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { +SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { + const bool verbose = GetVerbose(); + double tic=0.0, tic0=0.0; + if (verbose) + tic = omp_get_wtime(); int xN, iN, protecti=0; ctr=0; // needed for non-equi join case SEXP retFirstArg, retLengthArg, retIndexArg, allLen1Arg, allGrp1Arg; @@ -158,17 +162,11 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP allGrp1[0] = TRUE; protecti += 2; - // isorted arg - o = NULL; - if (!LOGICAL(isorted)[0]) { - SEXP order = PROTECT(allocVector(INTSXP, length(icolsArg))); - protecti++; - for (int j=0; j 1 && mult == ALL) { @@ -213,6 +215,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP Free(retLength); Free(retIndex); } + if (verbose) + Rprintf("bmerge: took %.3fs\n", omp_get_wtime()-tic); UNPROTECT(protecti); return (ans); } diff --git a/src/data.table.h b/src/data.table.h index 812ad5f975..a848ef0344 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -102,6 +102,10 @@ extern SEXP sym_index; extern SEXP sym_BY; extern SEXP sym_starts, char_starts; extern SEXP sym_maxgrpn; +extern SEXP sym_anyna; +extern SEXP sym_anyinfnan; +extern SEXP sym_anynotascii; +extern SEXP sym_anynotutf8; extern SEXP sym_colClassesAs; extern SEXP sym_verbose; extern SEXP SelfRefSymbol; @@ -138,7 +142,8 @@ int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); -SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); +SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); +SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg); // reuseSorting wrapper to forder int getNumericRounding_C(void); // reorder.c @@ -187,7 +192,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP on, SEXP verbose); // bmerge.c -SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, +SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg); diff --git a/src/forder.c b/src/forder.c index 7226f7e454..564f55ec30 100644 --- a/src/forder.c +++ b/src/forder.c @@ -32,6 +32,7 @@ static int nth = 1; // number of threads to use, throttled by default; used by cleanup() to ensure no mismatch in getDTthreads() calls static bool retgrp = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn : +static bool retstats = true; // return extra flags for any NA, NaN, -Inf, +Inf, non-ASCII, non-UTF8 static int nrow = 0; // used as group size stack allocation limit (when all groups are 1 row) static int *gs = NULL; // gs = final groupsizes e.g. 23,12,87,2,1,34,... static int gs_alloc = 0; // allocated size of gs @@ -277,11 +278,11 @@ static void cradix(SEXP *x, int n) free(cradix_xtmp); cradix_xtmp=NULL; } -static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count) +static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8) // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; - bool anyneedutf8=false; + bool anynotascii=false, anynotutf8=false; if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc); // # nocov if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str")); // # nocov // savetl_init() has already been called at the start of forder @@ -308,16 +309,24 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max ustr[ustr_n++] = s; SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); - if (!anyneedutf8 && NEED2UTF8(s)) anyneedutf8=true; + if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already + !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one + if (!anynotascii) + anynotascii=true; + if (!IS_UTF8(s)) + anynotutf8=true; + } } } *out_na_count = na_count; + *out_anynotascii = anynotascii; + *out_anynotutf8 = anynotutf8; if (ustr_n==0) { // all na *out_min = 0; *out_max = 0; return; } - if (anyneedutf8) { + if (anynotutf8) { SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n)); for (int i=0; i0) + any_na = 1; // may be written multiple times, for each column that has NA, but thats fine + if (infnan_count>0) + any_infnan = 1; + if (anynotascii) + any_notascii = 1; + if (anynotutf8) + any_notutf8 = 1; if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) { // all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA) if (na_count==nrow && nalast==-1) { for (int i=0; i0 || attr(idx, "anyinfnan")>0 +bool idxAnyNF(SEXP idx) { + return INTEGER(getAttrib(idx, sym_anyna))[0]>0 || INTEGER(getAttrib(idx, sym_anyinfnan))[0]>0; +} + +// forder, re-use existing key or index if possible, otherwise call forder +SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg) { + const bool verbose = GetVerbose(); + int protecti = 0; + double tic=0.0; + if (verbose) + tic = omp_get_wtime(); + if (isNull(DT)) + error("DT is NULL"); + if (!IS_TRUE_OR_FALSE(retGrpArg)) + error("retGrp must be TRUE or FALSE"); + bool retGrp = (bool)LOGICAL(retGrpArg)[0]; + if (!IS_TRUE_OR_FALSE(retStatsArg)) + error("retStats must be TRUE or FALSE"); + bool retStats = (bool)LOGICAL(retStatsArg)[0]; + if (!retStats && retGrp) + error("retStats must be TRUE whenever retGrp is TRUE"); // retStats doesnt cost anything and it will be much easier to optimize use of index + if (!IS_TRUE_OR_FALSE(sortGroupsArg)) + error("sort must be TRUE or FALSE"); + bool sortGroups = (bool)LOGICAL(sortGroupsArg)[0]; + if (!isLogical(naArg) || LENGTH(naArg) != 1) + error("na.last must be logical TRUE, FALSE or NA of length 1"); + bool na = (bool)LOGICAL(naArg)[0]; + if (!isInteger(ascArg)) + error("order must be integer"); // # nocov # coerced to int in R + if (!isLogical(reuseSortingArg) || LENGTH(reuseSortingArg) != 1) + error("reuseSorting must be logical TRUE, FALSE or NA of length 1"); + int reuseSorting = LOGICAL(reuseSortingArg)[0]; + if (!length(DT)) + return allocVector(INTSXP, 0); + int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt + if (reuseSorting==NA_LOGICAL) { + if (INHERITS(DT, char_datatable) && // unnamed list should not be optimized + sortGroups && + all1(ascArg)) { // could ascArg=-1 be handled by a rev()? + opt = -1; + } else { + if (verbose) + Rprintf("forderReuseSorting: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); + opt = 0; + } + } else if (reuseSorting) { + if (!INHERITS(DT,char_datatable)) + error("internal error: reuseSorting set to TRUE but DT is not a data.table"); // # nocov + if (!sortGroups) + error("internal error: reuseSorting set to TRUE but sort is FALSE"); // # nocov + if (!all1(ascArg)) + error("internal error: reuseSorting set to TRUE but order is not all 1"); // # nocov + opt = -1; + } else if (!reuseSorting) { + opt = 0; + } + SEXP ans = R_NilValue; + if (opt == -1 && !na && !retGrp && colsKeyHead(DT, by)) { + opt = 1; // keyOpt + ans = PROTECT(allocVector(INTSXP, 0)); protecti++; + if (verbose) + Rprintf("forderReuseSorting: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } + if (opt == -1 && GetUseIndex()) { + SEXP idx = getIndex(DT, by); + if (!isNull(idx)) { + bool hasStats = !isNull(getAttrib(idx, sym_anyna)); + if (!na || // na.last=FALSE + (hasStats && !idxAnyNF(idx))) { // na.last=TRUE && !anyNA + bool hasGrp = !isNull(getAttrib(idx, sym_starts)); + if (hasGrp && !hasStats) + error("internal error: index has 'starts' attribute but not 'anyna', please report to issue tracker"); // # nocov + if (hasGrp==retGrp && hasStats==retStats) { + opt = 2; // idxOpt + } else if ( + (hasGrp && !retGrp && !(!hasStats && retStats)) || // !hasStats should never happen when hasGrp + (hasStats && !retStats && !(!hasGrp && retGrp)) + ) { + // shallow_duplicate is faster than copyAsPlain, but shallow_duplicate is AFAIK good for VECSXP, not for INTSXP + // it is still the bottleneck in this opt, it is better to call retGrp=TRUE and just not use those extra attributes + // can we do better here? real shallow for INTSXP? If we could just re-point data pointer... like we do for DT columns + // SEXP new; INTEGER(new) = INTEGER(idx); setAttrib(new, ..., R_NilValue) + idx = shallow_duplicate(idx); + if (hasGrp && !retGrp) { + setAttrib(idx, sym_starts, R_NilValue); + setAttrib(idx, sym_maxgrpn, R_NilValue); + } + if (hasStats && !retStats) { + setAttrib(idx, sym_anyna, R_NilValue); + setAttrib(idx, sym_anyinfnan, R_NilValue); + setAttrib(idx, sym_anynotascii, R_NilValue); + setAttrib(idx, sym_anynotutf8, R_NilValue); + } + opt = 2; // idxOpt but need to drop groups or stats + } else if (!hasGrp && retGrp && !hasStats && retStats) { + if (verbose) + Rprintf("forderReuseSorting: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (!hasGrp && retGrp) { + if (verbose) + Rprintf("forderReuseSorting: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (!hasStats && retStats) { + if (verbose) + Rprintf("forderReuseSorting: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else { + error("internal error: reuseSorting forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov + } + } else { + if (!hasStats) { + if (verbose) + Rprintf("forderReuseSorting: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (idxAnyNF(idx)) { + if (verbose) + Rprintf("forderReuseSorting: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else { + error("internal error: reuseSorting forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov + } + } + if (opt == 2) { + ans = idx; + if (verbose) + Rprintf("forderReuseSorting: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } + } + } + if (opt < 1) { + ans = PROTECT(forder(DT, by, retGrpArg, retStatsArg, sortGroupsArg, ascArg, naArg)); protecti++; + if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or reuseSorting=FALSE + (!na || (retStats && !idxAnyNF(ans))) && // lets create index even if na.last=T used but no NAs detected! + GetUseIndex() && + GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only + putIndex(DT, by, ans); + if (verbose) + Rprintf("forderReuseSorting: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); + } + } + if (verbose) + Rprintf("forderReuseSorting: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); + UNPROTECT(protecti); + return ans; +} diff --git a/src/init.c b/src/init.c index 48046b8d6a..5ab53d0913 100644 --- a/src/init.c +++ b/src/init.c @@ -29,6 +29,10 @@ SEXP sym_index; SEXP sym_BY; SEXP sym_starts, char_starts; SEXP sym_maxgrpn; +SEXP sym_anyna; +SEXP sym_anyinfnan; +SEXP sym_anynotascii; +SEXP sym_anynotutf8; SEXP sym_colClassesAs; SEXP sym_verbose; SEXP SelfRefSymbol; @@ -73,6 +77,7 @@ R_CallMethodDef callMethods[] = { {"Cfcast", (DL_FUNC) &fcast, -1}, {"Cuniqlist", (DL_FUNC) &uniqlist, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, +{"CforderReuseSorting", (DL_FUNC) &forderReuseSorting, -1}, {"Cforder", (DL_FUNC) &forder, -1}, {"Cissorted", (DL_FUNC) &issorted, -1}, {"Cgforce", (DL_FUNC) &gforce, -1}, @@ -279,6 +284,10 @@ void attribute_visible R_init_data_table(DllInfo *info) sym_index = install("index"); sym_BY = install(".BY"); sym_maxgrpn = install("maxgrpn"); + sym_anyna = install("anyna"); + sym_anyinfnan = install("anyinfnan"); + sym_anynotascii = install("anynotascii"); + sym_anynotutf8 = install("anynotutf8"); sym_colClassesAs = install("colClassesAs"); sym_verbose = install("datatable.verbose"); SelfRefSymbol = install(".internal.selfref");