all.equal ignore row order, fix #4422 (#4423)

Rdatatable · Jun 2, 2020 · 738289d · 738289d
1 parent b1f73cf
commit 738289d
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 6 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -111,6 +111,8 @@ unit = "s")
 
 14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report.
 
+15. `all.equal(DT1, DT2, ignore.row.order=TRUE)` could return TRUE incorrectly in the presence of NAs, [#4422](https://github.com/Rdatatable/data.table/issues/4422).
+
 ## NOTES
 
 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto.

diff --git a/R/setops.R b/R/setops.R
@@ -216,13 +216,12 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu
       tolerance = 0
     }
     jn.on = copy(names(target)) # default, possible altered later on
-    char.cols = vapply_1c(target,typeof)=="character"
-    if (!identical(tolerance, 0)) { # handling character columns only for tolerance!=0
-      if (all(char.cols)) {
-        msg = c(msg, "Both datasets have character columns only, together with ignore.row.order this force 'tolerance' argument to 0, for character columns it does not have effect")
+    dbl.cols = vapply_1c(target,typeof)=="double"
+    if (!identical(tolerance, 0)) {
+      if (!any(dbl.cols)) { # dbl.cols handles (removed) "all character columns" (char.cols) case as well
         tolerance = 0
-      } else if (any(char.cols)) { # character col cannot be the last one during rolling join
-        jn.on = jn.on[c(which(char.cols), which(!char.cols))]
+      } else {
+        jn.on = jn.on[c(which(!dbl.cols), which(dbl.cols))] # double column must be last for rolling join
       }
     }
     if (target_dup && current_dup) {

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -16881,3 +16881,8 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
 test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
 A = data.table(A=as.complex(rep(NA, 5)))
 test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
+
+# all.equal ignore row order improperly handle NAs #4422
+d1 = data.table(a=1:2, b=c(1L,NA))
+d2 = data.table(a=1:2, b=1:2)
+test(2139.1, all.equal(d1, d2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target'")