diff --git a/NEWS.md b/NEWS.md index 454abff8c6..fa5fa3e79b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -247,6 +247,8 @@ 35. `DT[, i-1L, with=FALSE]` would misinterpret the minus sign and return an incorrect result, [#2019](https://github.com/Rdatatable/data.table/issues/2109). Thanks @cguill95 for the report. +36. `DT[id==1, DT2[.SD, on="id"]]` (i.e. joining from `.SD` in `j`) could incorrectly fail in some cases due to `.SD` being locked, [#1926](https://github.com/Rdatatable/data.table/issues/1926). Thanks @franknarf1 for the report and for diligently tracking use cases for almost 3 years! + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/bmerge.R b/R/bmerge.R index 2ff0f86524..e14a03a922 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -10,7 +10,14 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos # Otherwise, the type of the i column is always returned. i = shallow(i) + # merge on .SD in i fails _sometimes_ because of set() being done here, #1926 + .Call(C_unlock, i) x = shallow(x) + .Call(C_unlock, x) + if (.Call(C_islocked, callersi)) { + .Call(C_unlock, callersi) + on.exit(.Call(C_lock, callersi)) + } # careful to only plonk syntax (full column) on i/x from now on otherwise user's i and x would change; # this is why shallow() is very importantly internal only, currently. diff --git a/R/data.table.R b/R/data.table.R index fa594c79db..6cc4b9cde9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1004,7 +1004,7 @@ replace_order = function(isub, verbose, env) { newnames = NULL suppPrint = identity if (length(av) && av[1L] == ":=") { - if (identical(attr(x, ".data.table.locked", exact=TRUE), TRUE)) stop(".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference.") + if (.Call(C_islocked, x)) stop(".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference.") suppPrint = function(x) { .global$print=address(x); x } # Suppress print when returns ok not on error, bug #2376. Thanks to: http://stackoverflow.com/a/13606880/403310 # All appropriate returns following this point are wrapped; i.e. return(suppPrint(x)). @@ -1255,8 +1255,8 @@ replace_order = function(isub, verbose, env) { # Temp fix for #921. Allocate `.I` only if j-expression uses it. SDenv$.I = if (!missing(j) && use.I) seq_len(SDenv$.N) else 0L SDenv$.GRP = 1L - setattr(SDenv$.SD,".data.table.locked",TRUE) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=. - setattr(SDenv$.SDall,".data.table.locked",TRUE) + .Call(C_lock, SDenv$.SD) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=. + .Call(C_lock, SDenv$.SDall) lockBinding(".SD",SDenv) lockBinding(".SDall",SDenv) lockBinding(".N",SDenv) @@ -1271,11 +1271,11 @@ replace_order = function(isub, verbose, env) { } jval = eval(jsub, SDenv, parent.frame()) - .Call(Csetattrib, jval, '.data.table.locked', NULL) # in case jval inherits .SD's lock, #1341 #2245. Use .Call not setattr() to avoid bumping jval's MAYBE_SHARED. + .Call(C_unlock, jval) # in case jval inherits .SD's lock, #1341 #2245. .Call directly (not via an R function like setattr or unlock) to avoid bumping jval's MAYBE_SHARED. # copy 'jval' when required # More speedup - only check + copy if irows is NULL - # Temp fix for #921 - check address and copy *after* evaluating 'jval' + # Temp fix for #921 - check address and copy *after* evaluating 'jval'. #5114 also related. if (is.null(irows)) { if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list() @@ -1326,10 +1326,6 @@ replace_order = function(isub, verbose, env) { setnames(jval, jvnames) } - # fix for bug #5114 from GSee's - .data.table.locked=TRUE. # TO DO: more efficient way e.g. address==address (identical will do that but then proceed to deep compare if !=, wheras we want just to stop?) - # Commented as it's taken care of above, along with #921 fix. Kept here for the bug fix info and TO DO. - # if (identical(jval, SDenv$.SD)) return(copy(jval)) - if (is.data.table(jval)) { setattr(jval, 'class', class(x)) # fix for #5296 if (haskey(x) && all(key(x) %chin% names(jval)) && suppressWarnings(is.sorted(jval, by=key(x)))) # TO DO: perhaps this usage of is.sorted should be allowed internally then (tidy up and make efficient) @@ -1457,8 +1453,8 @@ replace_order = function(isub, verbose, env) { setattr(SDenv$.SD,"row.names",c(NA_integer_,0L)) } # .set_row_names() basically other than not integer() for 0 length, otherwise dogroups has no [1] to modify to -.N - setattr(SDenv$.SD,".data.table.locked",TRUE) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=. - setattr(SDenv$.SDall,".data.table.locked",TRUE) + .Call(C_lock, SDenv$.SD) # stops := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=. + .Call(C_lock, SDenv$.SDall) lockBinding(".SD",SDenv) lockBinding(".SDall",SDenv) lockBinding(".N",SDenv) @@ -2116,9 +2112,7 @@ transform.data.table = function (`_data`, ...) inx = chmatch(tags, names(`_data`)) matched = !is.na(inx) if (any(matched)) { - if (isTRUE(attr(`_data`, ".data.table.locked", exact=TRUE))) { - setattr(`_data`, ".data.table.locked", NULL) # fix for #1641, now covered by test 104.2 - } + .Call(C_unlock, `_data`) # fix for #1641, now covered by test 104.2 `_data`[,inx[matched]] = e[matched] `_data` = as.data.table(`_data`) } @@ -2299,7 +2293,8 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR )) # handle nested split if (flatten || length(by) == 1L) { - lapply(lapply(ll, setattr, '.data.table.locked', NULL), setDT) + for (x in ll) .Call(C_unlock, x) + lapply(ll, setDT) # alloc.col could handle DT in list as done in: c9c4ff80bdd4c600b0c4eff23b207d53677176bd } else if (length(by) > 1L) { lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten) @@ -2318,14 +2313,14 @@ copy = function(x) { anydt = vapply(x, is.data.table, TRUE, USE.NAMES=FALSE) if (sum(anydt)) { newx[anydt] = lapply(newx[anydt], function(x) { - setattr(x, ".data.table.locked", NULL) + .Call(C_unlock, x) alloc.col(x) }) } } return(newx) # e.g. in as.data.table.list() the list is copied before changing to data.table } - setattr(newx,".data.table.locked",NULL) + .Call(C_unlock, newx) alloc.col(newx) } @@ -2372,9 +2367,8 @@ point = function(to, to_idx, from, from_idx) { setattr(ans, "sorted", NULL) setattr(ans, "index", NULL) } - if (unlock) setattr(ans, '.data.table.locked', NULL) + if (unlock) .Call(C_unlock, ans) ans - } shallow = function(x, cols=NULL) { @@ -2834,7 +2828,7 @@ isReallyReal = function(x) { #' ATTENTION: If nothing else helps, an auto-index is created on x unless options prevent this. if(getOption("datatable.optimize") < 3L) return(NULL) ## at least level three optimization required. if (!is.call(isub)) return(NULL) - if (!is.null(attr(x, '.data.table.locked', exact=TRUE))) return(NULL) # fix for #958, don't create auto index on '.SD'. + if (.Call(C_islocked, x)) return(NULL) # fix for #958, don't create auto index on '.SD'. ## a list of all possible operators with their translations into the 'on' clause validOps = list(op = c("==", "%in%", "%chin%"), on = c("==", "==", "==")) diff --git a/R/setkey.R b/R/setkey.R index 0ffdf23a46..7e8f2f1e5d 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -49,7 +49,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } if (!is.data.table(x)) stop("x is not a data.table") if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.") - if (physical && identical(attr(x, ".data.table.locked", exact=TRUE),TRUE)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") + if (physical && .Call(C_islocked, x)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") if (!length(cols)) { warning("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.") setattr(x,"sorted",NULL) diff --git a/cc.R b/cc.R index 7fb83d2fae..c805287e37 100644 --- a/cc.R +++ b/cc.R @@ -76,9 +76,9 @@ cc = function(test=TRUE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir=Sys.getenv setwd(old) xx = getDLLRegisteredRoutines("datatable",TRUE) for (i in seq_along(xx$.Call)) - assign(xx$.Call[[i]]$name, xx$.Call[[i]]$address, env=.GlobalEnv) + assign(xx$.Call[[i]]$name, xx$.Call[[i]]$address, envir=.GlobalEnv) for (i in seq_along(xx$.External)) - assign(xx$.External[[i]]$name, xx$.External[[i]]$address, env=.GlobalEnv) + assign(xx$.External[[i]]$name, xx$.External[[i]]$address, envir=.GlobalEnv) sourceDir(paste0(cc_dir,"/R")) assign("testDir", function(x)paste0(cc_dir,"/inst/tests/",x), envir=.GlobalEnv) .onLoad() diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0a6b04f73f..8a308c19c8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15814,6 +15814,19 @@ DT = data.table(a=1, b=2) i = 2L test(2091, DT[ , i-1L, with=FALSE], data.table(a=1)) +# #1926 -- unlock .SD for bmerge in j +DT = data.table(id=1:2, v=3:4) +DT2 = data.table(id=1, x=5) +DT3 = copy(DT2) +## DT2.id is numeric so bmerge does coercion with set() +test(2092.1, DT[id == 1, DT2[.SD, on="id"]], data.table(id=1L, x=5, v=3L)) +DT[id == 1, x := DT2[.SD, x, on="id"]] +DT[id == 1, x := 4] +test(2092.2, DT2$x, DT3$x) +df1 = data.table(a=1:5, b=c(0, 0, 1, 0, 2)) +df2 = data.table(c=c(1, 1, 2, 2, 3), d=c(3, 4, 3, 5, 4)) +test(2092.3, copy(df2)[ , s := df1[.SD, on=.(a >= c, a <= d), sum(b), by=.EACHI]$V1], + df2[ , s := c(1, 1, 1, 3, 1)]) ################################### # Add new tests above this line # diff --git a/src/assign.c b/src/assign.c index 3838558d36..d0076d1e63 100644 --- a/src/assign.c +++ b/src/assign.c @@ -280,14 +280,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // rows : row numbers to assign R_len_t i, j, numToDo, targetlen, vlen, r, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; SEXP targetcol, names, nullint, thisv, targetlevels, newcol, s, colnam, tmp, colorder, key, index, a, assignedNames, indexNames; - SEXP bindingIsLocked = getAttrib(dt, install(".data.table.locked")); bool verbose=GetVerbose(), anytodelete=false; const char *c1, *tc1, *tc2; int *buf, newKeyLength, indexNo; size_t size; // must be size_t otherwise overflow later in memcpy if (isNull(dt)) error("assign has been passed a NULL dt"); if (TYPEOF(dt) != VECSXP) error("dt passed to assign isn't type VECSXP"); - if (length(bindingIsLocked) && LOGICAL(bindingIsLocked)[0]) + if (islocked(dt)) error(".SD is locked. Updating .SD by reference using := or set are reserved for future use. Use := in j directly. Or use copy(.SD) as a (slow) last resort, until shallow() is exported."); // We allow set() on data.frame too; e.g. package Causata uses set() on a data.frame in tests/testTransformationReplay.R diff --git a/src/data.table.h b/src/data.table.h index 469b9e76e7..a5579400c4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -87,6 +87,7 @@ SEXP sym_maxgrpn; SEXP sym_colClassesAs; SEXP sym_verbose; SEXP sym_inherits; +SEXP sym_datatable_locked; long long DtoLL(double x); double LLtoD(long long x); bool GetVerbose(); @@ -217,4 +218,8 @@ SEXP coerceFillR(SEXP fill); bool INHERITS(SEXP x, SEXP char_); bool Rinherits(SEXP x, SEXP char_); void copySharedColumns(SEXP x); +SEXP lock(SEXP x); +SEXP unlock(SEXP x); +bool islocked(SEXP x); +SEXP islockedR(SEXP x); diff --git a/src/init.c b/src/init.c index 378743a19a..d98a6234f1 100644 --- a/src/init.c +++ b/src/init.c @@ -83,6 +83,9 @@ SEXP nafillR(); SEXP colnamesInt(); SEXP initLastUpdated(); SEXP cj(); +SEXP lock(); +SEXP unlock(); +SEXP islockedR(); // .Externals SEXP fastmean(); @@ -170,10 +173,12 @@ R_CallMethodDef callMethods[] = { {"Ccj", (DL_FUNC) &cj, -1}, {"Ccoalesce", (DL_FUNC) &coalesce, -1}, {"CfifelseR", (DL_FUNC) &fifelseR, -1}, +{"C_lock", (DL_FUNC) &lock, -1}, // _ for these 3 to avoid Clock as in time +{"C_unlock", (DL_FUNC) &unlock, -1}, +{"C_islocked", (DL_FUNC) &islockedR, -1}, {NULL, NULL, 0} }; - static const R_ExternalMethodDef externalMethods[] = { {"Cfastmean", (DL_FUNC) &fastmean, -1}, @@ -300,6 +305,7 @@ void attribute_visible R_init_datatable(DllInfo *info) sym_verbose = install("datatable.verbose"); SelfRefSymbol = install(".internal.selfref"); sym_inherits = install("inherits"); + sym_datatable_locked = install(".data.table.locked"); initDTthreads(); avoid_openmp_hang_within_fork(); diff --git a/src/subset.c b/src/subset.c index 204c490497..7891a36dd5 100644 --- a/src/subset.c +++ b/src/subset.c @@ -324,7 +324,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { for (int j=0; j 5, rank_in_team := frank(ERA), by = .(teamID, yearID)] Pitching[rank_in_team == 1, team_performance := - # this should work without needing copy(); - # that it doesn't appears to be a bug: - # https://github.com/Rdatatable/data.table/issues/1926 - Teams[copy(.SD), Rank, on = c('teamID', 'yearID')]] + Teams[.SD, Rank, on = c('teamID', 'yearID')]] ``` Note that the `x[y]` syntax returns `nrow(y)` values (i.e., it's a right join), which is why `.SD` is on the right in `Teams[.SD]` (since the RHS of `:=` in this case requires `nrow(Pitching[rank_in_team == 1])` values.