Revert, sync with Rdatatable, modify SHAs

Anirban166 · Oct 8, 2024 · 7b0b68d · 7b0b68d
1 parent 5750f5d
commit 7b0b68d
Showing 1 changed file with 110 additions and 108 deletions.
diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -1,68 +1,48 @@
-# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745
+# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
 # https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
 extra.args.6107 <- c(
   "colClasses=list(Date='date')",
   "colClasses='Date'",
-  "select=list(Date='date')"
-)
-
+  "select=list(Date='date')")
 extra.test.list <- list()
-
 for (extra.arg in extra.args.6107){
   this.test <- atime::atime_test(
     setup = {
       set.seed(1)
-      DT = data.table(date = as.Date(sample(20000, N, replace = TRUE)))
+      DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
       tmp_csv = tempfile()
       fwrite(DT, tmp_csv)
     },
-    Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent commit SHA
-    Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2"   # Merge commit SHA
-  )
-
-  # Define the expression as a language object using str2lang (acceptable here since it's a direct expression)
-  this.test$expr <- str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
-
-  # Assign a unique name to each test case
-  test_name <- sprintf("fread(%s) improved in #6107", extra.arg)
-  extra.test.list[[test_name]] <- this.test
+    Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
+    Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
+  this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
+  extra.test.list[[sprintf("fread(%s) improved in #6107", extra.arg)]] <- this.test
 }
 
-# Define retGrp_values as logicals
-retGrp_values <- c(TRUE, FALSE)
-
-# Loop through logical retGrp_setup and retGrp_expr
-for(retGrp_setup in retGrp_values){
-  for(retGrp_expr in retGrp_values){
-    test.name <- sprintf("forderv(retGrp=%s-%s) improved in #4386", retGrp_setup, retGrp_expr)
-
-    extra.test.list[[test.name]] <- list(
-      setup = quote({
-        options(datatable.forder.auto.index = TRUE)
-        set.seed(1)
-        dt <- data.table(index = sample(N), values = sample(N))
-        index.list <- list()
-        for(retGrp in retGrp_values){
-          # Pass logical retGrp directly
-          data.table:::forderv(dt, "index", retGrp = retGrp)
-          # Use as.character(retGrp) to store in the list
-          index.list[[as.character(retGrp)]] <- attr(dt, "index")
-        }
-      }),
-      expr = substitute({
-        # Retrieve the index using as.character to match the setup
-        setattr(dt, "index", index.list[[as.character(retGrp_setup)]])
-        # Pass logical retGrp_expr directly
-        data.table:::forderv(dt, "index", retGrp = retGrp_expr)
-      }, list(
-        retGrp_setup = retGrp_setup,
-        retGrp_expr = retGrp_expr
-      )),
-      Slow = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent commit SHA
-      Fast = "1a84514f6d20ff1f9cc614ea9b92ccdee5541506"   # Merge commit SHA
-    )
-  }
-}
+# Test case adapted from https://github.com/Rdatatable/data.table/pull/4386#issue-602528139 which is where the performance was improved.
+for(retGrp_chr in c("T","F"))extra.test.list[[sprintf(
+  "forderv(retGrp=%s) improved in #4386", retGrp_chr
+)]] <- list(
+  setup = quote({
+    dt <- data.table(group = rep(1:2, l=N))
+  }),
+  expr = substitute({
+    old.opt <- options(datatable.forder.auto.index = TRUE) # required for test, un-documented, comments in forder.c say it is for debugging only.
+    data.table:::forderv(dt, "group", retGrp = RETGRP)
+    options(old.opt) # so the option does not affect other tests.
+  }, list(RETGRP=eval(str2lang(retGrp_chr)))),
+  ## From ?bench::mark, "Each expression will always run at least twice,
+  ## once to measure the memory allocation and store results
+  ## and one or more times to measure timing."
+  ## So for atime(times=10) that means 11 times total.
+  ## First time for memory allocation measurement,
+  ## (also sets the index of dt in this example),
+  ## then 10 more times for time measurement.
+  ## Timings should be constant if the cached index is used (Fast),
+  ## and (log-)linear if the index is re-computed (Slow).
+  Slow = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of merge commit
+  Fast = "1a84514f6d20ff1f9cc614ea9b92ccdee5541506" # Merge commit SHA
+)
 
 # A list of performance tests.
 #
@@ -87,7 +67,29 @@ for(retGrp_setup in retGrp_values){
 test.list <- atime::atime_test_list(
   # Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
   N = as.integer(10^seq(1, 7, by=0.25)),
-  # pkg.edit.fun remains unchanged
+  # A function to customize R package metadata and source files to facilitate version-specific installation and testing.
+  #
+  # This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
+  # to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package.
+  # It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately.
+  #
+  # @param old.Package Current name of the package.
+  # @param new.Package New name of the package, including a SHA hash.
+  # @param sha SHA1 hash used for differentiating versions.
+  # @param new.pkg.path Path to the package files.
+  #
+  # @details
+  # The function modifies:
+  # - DESCRIPTION, updating the package name.
+  # - Makevars, customizing the shared object file name and adjusting the build settings.
+  # - R/onLoad.R, adapting custom version checking for package loading operations.
+  # - NAMESPACE, changing namespace settings for dynamic linking.
+  #
+  # @examples
+  # pkg.edit.fun("data.table", "data.table.some_SHA1_hash", "some_SHA1_hash", "/path/to/data.table")
+  #
+  # @return None (performs in-place file modifications)
+  # @note This setup is typically unnecessary for most packages but essential for data.table due to its unique configuration.
   pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
     pkg_find_replace <- function(glob, FIND, REPLACE) {
       atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
@@ -98,77 +100,76 @@ test.list <- atime::atime_test_list(
     pkg_find_replace(
       "DESCRIPTION",
       paste0("Package:\\s+", old.Package),
-      paste("Package:", new.Package)
-    )
+      paste("Package:", new.Package))
     pkg_find_replace(
       file.path("src", "Makevars.*in"),
       Package_regex,
-      new.Package_
-    )
+      new.Package_)
     pkg_find_replace(
       file.path("R", "onLoad.R"),
       Package_regex,
-      new.Package_
-    )
+      new.Package_)
     pkg_find_replace(
       file.path("R", "onLoad.R"),
       sprintf('packageVersion\\("%s"\\)', old.Package),
-      sprintf('packageVersion\\("%s"\\)', new.Package)
-    )
+      sprintf('packageVersion\\("%s"\\)', new.Package))
     pkg_find_replace(
       file.path("src", "init.c"),
       paste0("R_init_", Package_regex),
-      paste0("R_init_", gsub("[.]", "_", new.Package_))
-    )
+      paste0("R_init_", gsub("[.]", "_", new.Package_)))
     pkg_find_replace(
       "NAMESPACE",
       sprintf('useDynLib\\("?%s"?', Package_regex),
-      paste0('useDynLib(', new.Package_)
-    )
+      paste0('useDynLib(', new.Package_))
   },
 
-  # Existing tests
+  # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
     setup = {
       set.seed(1L)
       dt <- data.table(a = sample.int(N))
       setindexv(dt, "a")
     },
-    expr = quote(data.table:::shallow(dt)),
-    Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4",
-    Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"
-  ),
+    expr = data.table:::shallow(dt),
+    # Before = "", This needs to be updated later as there are two issues here: A) The source of regression (or the particular commit that led to it) is not clear; B) Older versions of data.table are having problems when being installed in this manner (This includes commits from before March 20 2020, when the issue that discovered or first mentioned the regression was created)
+    Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
+    Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
 
+  # Test based on https://github.com/Rdatatable/data.table/issues/5424
+  # Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
   "memrecycle regression fixed in #5463" = atime::atime_test(
     setup = {
-      bigN <- N * 100
+      bigN <- N*100
       set.seed(2L)
       dt <- data.table(
         g = sample(seq_len(N), bigN, TRUE),
         x = runif(bigN),
-        key = "g"
-      )
+        key = "g")
       dt_mod <- copy(dt)
     },
-    expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)),
-    Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854",
-    Regression = "e793f53466d99acee1589910c69c1a2c6586b95d",
-    Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"
-  ),
+    expr = data.table:::`[.data.table`(dt_mod, , N := .N, by = g),
+    Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
+    Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
+    Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
 
+  # Issue reported in https://github.com/Rdatatable/data.table/issues/5426
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
   "setDT improved in #5427" = atime::atime_test(
     setup = {
       L <- replicate(N, 1, simplify = FALSE)
       setDT(L)
     },
-    expr = quote({
+    expr = {
       data.table:::setattr(L, "class", NULL)
       data.table:::setDT(L)
-    }),
-    Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12",
-    Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"
-  ),
+    },
+    Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
+    Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
 
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
     setup = {
       d <- data.table(
@@ -177,44 +178,48 @@ test.list <- atime::atime_test_list(
         v2 = sample(5L, N, TRUE)
       )
     },
-    expr = quote(data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id)),
-    Before = "7a9eaf62ede487625200981018d8692be8c6f134",
-    Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d",
-    Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"
-  ),
+    expr = data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id),
+    Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression
+    Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
+    Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
 
+  # Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
   "DT[,.SD] improved in #4501" = atime::atime_test(
     setup = {
       set.seed(1)
       L = as.data.table(as.character(rnorm(N, 1, 0.5)))
       setkey(L, V1)
     },
-    expr = quote(data.table:::`[.data.table`(L, , .SD)),
-    Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461",
-    Slow = "3ca83738d70d5597d9e168077f3768e32569c790"
-    # Removed 'Slower' label as it's not standard
-  ),
+    ## New DT can safely retain key.
+    expr = data.table:::`[.data.table`(L, , .SD),
+    Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
+    Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
+    Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
 
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/6296
   "DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
     setup = {
       dt = data.table(a = 1:N)
       dt_mod <- copy(dt)
     },
-    expr = quote(data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE)),
-    Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e",
-    Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"
-  ),
+    expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
+    Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
+    Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
 
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
+  # and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
   "transform improved in #5493" = atime::atime_test(
     setup = {
       df <- data.frame(x = runif(N))
       dt <- as.data.table(df)
     },
-    expr = quote(data.table:::transform.data.table(dt, y = round(x))),
-    Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31",
-    Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"
-  ),
+    expr = data.table:::transform.data.table(dt, y = round(x)),
+    Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
+    Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
 
+  # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
   "melt improved in #5054" = atime::atime_test(
     setup = {
       DT <- as.data.table(as.list(1:N))
@@ -224,12 +229,9 @@ test.list <- atime::atime_test_list(
         x
       })  
     },
-    expr = quote(data.table:::melt(DT, measure.vars = measure.vars)),
-    Slow = "fd24a3105953f7785ea7414678ed8e04524e6955",
-    Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"
-  ),
-
-  # Append the extra tests (fread and forder)
-  tests = extra.test.list
-)
+    expr = data.table:::melt(DT, measure.vars = measure.vars),
+    Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
+    Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
+
+  tests=extra.test.list)
 # nolint end: undesirable_operator_linter.