From d0fd09482f336943a56afac029e5e6b0911cccff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sun, 1 Mar 2020 20:52:09 +0100 Subject: [PATCH 01/10] Fixed crash when attempting to join on character(0) --- R/data.table.R | 2 +- R/merge.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 9292ee940b..c517c818c2 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3031,7 +3031,7 @@ isReallyReal = function(x) { onsub = as.call(c(quote(c), onsub)) } on = eval(onsub, parent.frame(2L), parent.frame(2L)) - if (!is.character(on)) + if (length(on) == 0L || !is.character(on)) stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") ## extract the operators and potential variable names from 'on'. ## split at backticks to take care about variable names like `col1<=`. diff --git a/R/merge.R b/R/merge.R index 31f322fce5..13c6a9d055 100644 --- a/R/merge.R +++ b/R/merge.R @@ -21,7 +21,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!missing(by) && !missing(by.x)) warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") if (!is.null(by.x)) { - if ( !is.character(by.x) || !is.character(by.y)) + if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) stop("A non-empty vector of column names are required for `by.x` and `by.y`.") if (!all(by.x %chin% names(x))) stop("Elements listed in `by.x` must be valid column names in x.") From 2e185f6d8bec9b26e46f3b659d7c7157243cc024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sun, 1 Mar 2020 20:52:42 +0100 Subject: [PATCH 02/10] Minor grammar fix in error message --- R/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index 13c6a9d055..fe3bdb4549 100644 --- a/R/merge.R +++ b/R/merge.R @@ -22,7 +22,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") if (!is.null(by.x)) { if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) - stop("A non-empty vector of column names are required for `by.x` and `by.y`.") + stop("A non-empty vector of column names is required for `by.x` and `by.y`.") if (!all(by.x %chin% names(x))) stop("Elements listed in `by.x` must be valid column names in x.") if (!all(by.y %chin% names(y))) From a3e0e62ad0360dccc0db83e8e5db614f210aac57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sun, 1 Mar 2020 20:53:10 +0100 Subject: [PATCH 03/10] Atted test for crash when attempting to join on character(0) --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..85a71baa04 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16846,3 +16846,10 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# Attempting to join on character(0) shouldn't crash R +A = data.table(A='a') +B = data.table(B='b') +test(2139.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector") +test(2139.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.") +test(2139.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names are required") From e85e836a20b60d6eeff93569334bc0664cc0e98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sun, 1 Mar 2020 21:12:33 +0100 Subject: [PATCH 04/10] Added news entry --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..5daa81a968 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,6 +107,8 @@ unit = "s") 12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. +13. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash. Thanks to @tlapak for the PR. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. From 3e5ae0bd67557b4d22ce2d1de5be34e9c93ecf00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sun, 1 Mar 2020 21:44:25 +0100 Subject: [PATCH 05/10] Reflect grammar change to error message in tests --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 85a71baa04..8fb2848872 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15869,7 +15869,7 @@ test(2074.31, dcast(DT, V1 ~ z, fun.aggregate=eval(quote(length)), value.var='z' test(2074.32, fwrite(DT, logical01=TRUE, logicalAsInt=TRUE), error="logicalAsInt has been renamed") # merge.data.table -test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of column names are required") +test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of column names is required") # shift naming test(2074.34, shift(list(a=1:5, b=6:10), give.names=TRUE), list(a_lag_1=c(NA, 1:4), b_lag_1=c(NA, 6:9))) @@ -16852,4 +16852,4 @@ A = data.table(A='a') B = data.table(B='b') test(2139.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector") test(2139.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.") -test(2139.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names are required") +test(2139.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required") From bdfb6945057fcd9683fb251dd0f0a732f0a6659b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Mon, 2 Mar 2020 06:24:35 +0100 Subject: [PATCH 06/10] Added link to news file linking the PR --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5daa81a968..51fabb621e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,7 +107,7 @@ unit = "s") 12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. -13. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash. Thanks to @tlapak for the PR. +13. `X[Y, on=character(0)]` and `merge(X, Y, by.x=character(0), by.y=character(0))` no longer crash, [#4272](https://github.com/Rdatatable/data.table/pull/4272). Thanks to @tlapak for the PR. ## NOTES From cba5c55d74ff6861f62726302fb7bee447bae005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Mon, 25 May 2020 22:04:53 +0200 Subject: [PATCH 07/10] Add check in bmerge.c --- src/bmerge.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bmerge.c b/src/bmerge.c index 15d7d6f4f7..746c1bc426 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -47,6 +47,8 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE i = iArg; x = xArg; // set globals so bmerge_r can see them. if (!isInteger(icolsArg)) error(_("Internal error: icols is not integer vector")); // # nocov if (!isInteger(xcolsArg)) error(_("Internal error: xcols is not integer vector")); // # nocov + if (LENGTH(icolsArg) == 0 || LENGTH(xcolsArg) == 0) + error(_("Internal error: icols and xcols must be non-empty vectors.")); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) error(_("Internal error: length(icols) [%d] > length(xcols) [%d]"), LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov icols = INTEGER(icolsArg); xcols = INTEGER(xcolsArg); From 008e472153934120313275fcd9f05ce6fbfcdbd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Mon, 25 May 2020 22:09:47 +0200 Subject: [PATCH 08/10] Add tests for check in bmerge.c --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8fb2848872..1af8d83a02 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16853,3 +16853,5 @@ B = data.table(B='b') test(2139.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector") test(2139.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.") test(2139.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required") +# Also shouldn't crash when using internal functions +test(2139.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty') From 153f712f70fea7db182dfd8371383ff9c39e0e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sat, 6 Jun 2020 21:43:17 +0200 Subject: [PATCH 09/10] Update to pass 2126.* --- src/bmerge.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bmerge.c b/src/bmerge.c index 746c1bc426..d823a63561 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -47,8 +47,8 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE i = iArg; x = xArg; // set globals so bmerge_r can see them. if (!isInteger(icolsArg)) error(_("Internal error: icols is not integer vector")); // # nocov if (!isInteger(xcolsArg)) error(_("Internal error: xcols is not integer vector")); // # nocov - if (LENGTH(icolsArg) == 0 || LENGTH(xcolsArg) == 0) - error(_("Internal error: icols and xcols must be non-empty vectors.")); + if ((LENGTH(icolsArg) == 0 || LENGTH(xcolsArg) == 0) && LENGTH(i) > 0) // We let through LENGTH(i) == 0 for tests 2126.* + error(_("Internal error: icols and xcols must be non-empty integer vectors.")); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) error(_("Internal error: length(icols) [%d] > length(xcols) [%d]"), LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov icols = INTEGER(icolsArg); xcols = INTEGER(xcolsArg); @@ -70,7 +70,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE roll = 0.0; rollToNearest = FALSE; if (isString(rollarg)) { if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'")); - if (TYPEOF(VECTOR_ELT(i, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); + if (ncol > 0 && TYPEOF(VECTOR_ELT(i, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later } else { if (!isReal(rollarg)) error(_("Internal error: roll is not character or double")); // # nocov From 04cedfdddf61e91b7421705a89a21a7de2c5898a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Sat, 6 Jun 2020 21:53:09 +0200 Subject: [PATCH 10/10] Expand tests 2126 --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1af8d83a02..6f80f4b995 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16639,6 +16639,9 @@ options(old_width) DT = data.table(A="a", key="A") test(2126.1, DT[J(NULL)], DT[0]) test(2126.2, DT[data.table()], DT[0]) +# additional segfault when i is NULL and roll = 'nearest' +test(2126.3, DT[J(NULL), roll = 'nearest'], DT[0]) +test(2126.4, DT[data.table(), roll = 'nearest'], DT[0]) # fcase, #3823 test_vec1 = -5L:5L < 0L