Rdatatable · mattdowle · May 28, 2019 · May 22, 2019 · May 22, 2019 · May 22, 2019
@@ -39,7 +39,8 @@ Authors@R: c(
   person("Michael","Quinn",        role="ctb"),
   person("@javrucebo","",          role="ctb"),
   person("@marc-outins","",        role="ctb"),
-  person("Roy","Storey",           role="ctb"))
+  person("Roy","Storey",           role="ctb"),
+  person("Manish","Saraswat",      role="ctb"))
 Depends: R (>= 3.1.0)
 Imports: methods
 Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml

@@ -122,6 +122,8 @@
 
 16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a a regression in v1.12.2. Thanks to Damian Betebenner for reporting.
 
+17. `setkey` now checks for existing index before doing `forder` [#3582](https://github.com/Rdatatable/data.table/pull/3582/). Executes `forder` only if the index does not exist. Thanks to @MichaelChirico for reporting and @saraswatmks for the PR.
+
 #### NOTES
 
 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.

@@ -86,12 +86,27 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
     if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.")
   }
   if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
-  if (verbose) {
-    tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
-    # suppress needed for tests 644 and 645 in verbose mode
-    cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n")
+
+  # get existing index name if any
+  newkey = paste0(cols, collapse="__")
+
+  # forder only if index is not present
+  if (!any(indices(x) == newkey)){
+      if (verbose) {
+          tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
+          # suppress needed for tests 644 and 645 in verbose mode
+          cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n")
+      } else {
+          o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)
+      }
   } else {
-    o = forderv(x, cols, sort=TRUE, retGrp=FALSE)
+      # find the name of matching index
+      if (verbose){
+          cat("using existing index for", newkey, "\n")
+          o <- attr(attributes(x)$index, which=newkey, exact = TRUE)
+      } else {
+          o <- attr(attributes(x)$index, which=newkey, exact = TRUE)
+      }
   }
   if (!physical) {
     if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer())

@@ -5668,13 +5668,13 @@ test(1376.06, indices(DT), c("b","a"))  # 2 secondary keys of single columns
 test(1376.07, DT[a==7L,verbose=TRUE], DT[7L], output="Optimized subsetting with index 'a'")
 setkey(DT,b)
 test(1376.08, indices(DT), NULL)
-test(1376.09, list(DT[a==2L], indices(DT)), list(DT[9L],"a"))  # create indices for next test
+test(1376.09, list(DT[a==2L], indices(DT)), list(DT[2L],"a"))  # create indices for next test
 setindex(DT,NULL)
 test(1376.10, list(key(DT), indices(DT)), list("b", NULL))
 options(datatable.auto.index = FALSE)
-test(1376.11, list(DT[a==2L], indices(DT)), list(DT[9L],NULL))
+test(1376.11, list(DT[a==2L], indices(DT)), list(DT[2L],NULL))
 options(datatable.auto.index = TRUE)
-test(1376.12, list(DT[a==2L], indices(DT)), list(DT[9L],"a"))
+test(1376.12, list(DT[a==2L], indices(DT)), list(DT[2L],"a"))
 
 # When i is FALSE and a column is being added by reference, for consistency with cases when i is not FALSE
 # we should still add the column. But we need to know what type it should be, so the user supplied RHS of :=
@@ -5934,7 +5934,7 @@ test(1419.18, key(thisDT), "x1")
 ## testing secondary index retainment on assign (#2372)
 
 allIndicesValid <- function(DT){
-  ## checks that the order of all indices is correct
+  ## checks that the order of all indices are correct
   for(idx in seq_along(indices(DT))){
     index <- attr(attr(DT, "index"), paste0("__", indices(DT)[idx], collapse = ""))
     if(!length(index)) index <- seq_len(nrow(DT))
@@ -6031,6 +6031,33 @@ thisDT <- copy(DT)[, c("aaa", "b") := 2]
 test(1419.58, indices(thisDT), c("a", "ab"))
 test(1419.59, allIndicesValid(thisDT), TRUE)
 
+## setkey on same col as existing index, #2889
+DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5),
+                 aaa = c(1,1,2,2,2,1,1,2,2,2))
+setindex(DT, a)
+test(1419.60, allIndicesValid(DT), TRUE)
+setindex(DT, NULL)
+setkey(DT, a)
+test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9))
+setkey(DT, NULL)
+setindex(DT, a)
+test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"),
+              output="using existing index for a")  # checks also that the prior index a is dropped (because y is keyed with no index)
+
+# setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals)
+DT = data.table(a =   c(3,3,4,4,5,6,1,1,7,2),
+                aaa = c(1,1,2,2,2,1,1,2,2,2),
+                bbb = c(1,1,2,0,1,1,1,0,1,1))
+setkey(DT, a)
+test(1419.63, DT$a, c(1,1,2,3,3,4,4,5,6,7))
+setkey(DT, NULL)
+test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took")
+setkey(DT, NULL)
+setindex(DT, aaa, a)
+setindex(DT, aaa)   # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. ensure no partial matching
+test(1419.65, allIndicesValid(DT), TRUE)
+test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"),
+              output="using existing index for aaa")  # checks that all indexes are dropped (aaa_a too)
 
 # setnames updates secondary key
 DT = data.table(a=1:5,b=10:6)