diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index f6d68ce682..5110794770 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -231,7 +231,20 @@ test.list <- atime::atime_test_list( }, expr = data.table:::melt(DT, measure.vars = measure.vars), Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue - Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue + Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns." + + # Test case created from @tdhock's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2327396833, in turn adapted from @philippechataignon's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2326714012 + "fwrite refactored in #6393" = atime::atime_test( + setup = { + set.seed(1) + NC = 10L + L <- data.table(i=1:N) + L[, paste0("V", 1:NC) := replicate(NC, rnorm(N), simplify=FALSE)] + out.csv <- tempfile() + }, + expr = data.table::fwrite(L, out.csv, compress="gzip"), + Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip. 
+ PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR. tests=extra.test.list) # nolint end: undesirable_operator_linter. diff --git a/.ci/linters/c/alloc_linter.R b/.ci/linters/c/alloc_linter.R index 8fb6b2dd05..3e72d102d1 100644 --- a/.ci/linters/c/alloc_linter.R +++ b/.ci/linters/c/alloc_linter.R @@ -4,8 +4,9 @@ # 2. Check the next line for a check like 'if (!x || !y)' alloc_linter = function(c_obj) { lines = c_obj$lines - # Be a bit more precise to avoid mentions in comments - alloc_lines = grep(R"{=\s*([(]\w+\s*[*][)])?[mc]alloc[(]}", lines) + # Be a bit more precise to avoid mentions in comments, and allow + # malloc(0) to be used for convenience (e.g. #6757) + alloc_lines = grep(R"{=\s*([(]\w+\s*[*][)])?[mc]alloc[(][^0]}", lines) if (!length(alloc_lines)) return() # int *tmp=(int*)malloc(...); or just int tmp=malloc(...); alloc_keys = lines[alloc_lines] |> diff --git a/.devcontainer/r-devel-alpine/Dockerfile b/.devcontainer/r-devel-alpine/Dockerfile new file mode 100644 index 0000000000..39194563d3 --- /dev/null +++ b/.devcontainer/r-devel-alpine/Dockerfile @@ -0,0 +1,22 @@ +FROM docker.io/rhub/r-minimal:devel + +RUN apk update \ + && apk add --no-cache \ + gcc git musl-dev openmp pkgconf tzdata zlib-dev \ + && echo 'options("repos"="https://cloud.r-project.org")' >> /usr/local/lib/R/etc/Rprofile.site + +ENV TZDIR=/usr/share/zoneinfo + +COPY DESCRIPTION . + +RUN Rscript -e ' \ +read.dcf("DESCRIPTION", c("Imports", "Suggests")) |> \ + tools:::.split_dependencies() |> \ + names() |> \ + setdiff(tools:::.get_standard_package_names()$base) |> \ + install.packages(repos="https://cloud.r-project.org") \ +' + +# setup cc() +WORKDIR /root +COPY .devcontainer/.Rprofile . 
diff --git a/.devcontainer/r-devel-alpine/devcontainer.json b/.devcontainer/r-devel-alpine/devcontainer.json new file mode 100644 index 0000000000..74c039fb7a --- /dev/null +++ b/.devcontainer/r-devel-alpine/devcontainer.json @@ -0,0 +1,9 @@ +{ + "build": { "dockerfile": "Dockerfile", "context": "../.." }, + "customizations": { "vscode": { + "extensions": [ + "REditorSupport.r", + "ms-vscode.cpptools-extension-pack" + ] + }} + } diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md index 5be83b9035..ea74b0cf3d 100644 --- a/.github/ISSUE_TEMPLATE/issue_template.md +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -5,13 +5,18 @@ about: Report a bug or describe a new requested feature Click preview tab ^^^ above! -By continuing to file this new issue / feature request, I confirm I have : +By continuing to file this new issue / feature request, I confirm I have: 1. searched the [live NEWS file](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) to see if it has been fixed in dev already. If so, I tried the [latest dev version](https://github.com/Rdatatable/data.table/wiki/Installation#windows). 2. looked at the titles of all the issues in the [current milestones](https://github.com/Rdatatable/data.table/milestones) and am aware of those. (Adding new information to existing issues is very helpful and appreciated.) 3. [searched all issues](https://github.com/Rdatatable/data.table/issues) (i.e. not in a milestone yet) for similar issues to mine and will include links to them explaining why mine is different. 4. searched on [Stack Overflow's data.table tag](http://stackoverflow.com/questions/tagged/data.table) and there is nothing similar there. 5. read the [Support](https://github.com/Rdatatable/data.table/wiki/Support) and [Contributing](https://github.com/Rdatatable/data.table/blob/master/.github/CONTRIBUTING.md) guides. -6. 
please don't tag your issue with text in the title; project members will add the appropriate tags later. + +Some general advice on the title and description fields for your PR + +- Please don't tag your issue with text in the title like '[Joins]'; project members will add the appropriate tags later. +- Don't write text like 'Closes #xxx' in the PR title either; GitHub does not recognize this text, whereas GitHub auto-links issues in the description, [see docs](https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword). +- Title and Description fields should try and be self-contained as much as possible. The title answers "what is this change" and the description provides necessary details/thought processes/things tried but abandoned. Imagine visiting your PR in 5 years' time and trying to glean what it's all about quickly and without needing to open 10 new tabs. #### Thanks! Please remove the text above and include the two items below. 
diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 19ee0c3270..5e9692ff40 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -70,6 +70,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - uses: r-lib/actions/setup-r@v2 - name: Lint run: for (f in list.files('.ci/linters/md', full.names=TRUE)) source(f) shell: Rscript {0} diff --git a/.github/workflows/pkgup.yaml b/.github/workflows/pkgup.yaml index f2080dcdac..8eeedf8021 100644 --- a/.github/workflows/pkgup.yaml +++ b/.github/workflows/pkgup.yaml @@ -50,7 +50,7 @@ jobs: cp -R ${{ env.R_LIBS_USER }} library R CMD INSTALL --library="library" $(ls -1t data.table_*.tar.gz | head -n 1) --html mkdir -p doc/html - cp /usr/share/R/doc/html/{left.jpg,up.jpg,Rlogo.svg,R.css,index.html} doc/html + cp $(R RHOME)/doc/html/{left.jpg,up.jpg,Rlogo.svg,R.css,index.html} doc/html Rscript -e 'utils::make.packages.html("library", docdir="doc")' sed -i "s|file://|../..|g" doc/html/packages.html mkdir -p public diff --git a/CODEOWNERS b/CODEOWNERS index 6d12041c39..1572fa3e56 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -68,6 +68,11 @@ # docs /man/openmp-utils.Rd @Anirban166 /Seal_of_Approval.md @tdhock +/GOVERNANCE.md @Rdatatable/committers # GLCI .gitlab-ci.yml @jangorecki @ben-schwen + +# C code tricks +/src/chmatch.c @aitap +/src/fread.c @aitap diff --git a/GOVERNANCE.md b/GOVERNANCE.md index 33449a3674..6c9ce58620 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -94,8 +94,9 @@ A pull request can be merged by any committer, if there is one approving review, ## Changing this GOVERNANCE.md document -There is no special process for changing this document (submit a PR -and ask for review). +There is no special process for changing this document. Submit a PR and ask for review; the group `@Rdatatable/committers` will automatically be assigned to ensure all current Committers are aware of the change. 
+ +Please also make a note in the change log under [`# Governance history`](#governance-history) # Code of conduct @@ -123,6 +124,8 @@ data.table Version line in DESCRIPTION typically has the following meanings # Governance history +Jan 2025: clarify that edits to governance should notify all committers. + Feb 2024: change team name/link maintainers to committers, to be consistent with role defined in governance. Nov-Dec 2023: initial version drafted by Toby Dylan Hocking and diff --git a/NAMESPACE b/NAMESPACE index 2497f0cf9d..2341ac9356 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,7 @@ importFrom(methods, "S3Part<-", slotNames) exportClasses(data.table, IDate, ITime) ## -export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) +export(data.table, tables, setkey, setkeyv, key, haskey, CJ, SJ, copy) export(rowwiseDT) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) diff --git a/NEWS.md b/NEWS.md index fa384dc105..bdc61fd5fd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,6 +69,10 @@ rowwiseDT( 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`. +7. 
`fwrite()` with `compress="gzip"` produces compatible gz files when composed of multiple independent chunks owing to parallelization, [#6356](https://github.com/Rdatatable/data.table/issues/6356). Earlier `fwrite()` versions could have issues with HTTP upload using `Content-Encoding: gzip` and `Transfer-Encoding: chunked`. Thanks to @oliverfoster for report and @philippechataignon for the fix. Thanks also @aitap for pre-release testing that found some possible memory leaks in the initial fix. + +8. `fwrite()` gains a new parameter `compressLevel` to control compression level for gzip, [#5506](https://github.com/Rdatatable/data.table/issues/5506). This parameter balances compression speed and total compression, and corresponds directly to the analogous command-line parameter, e.g. `compressLevel=4` corresponds to passing `-4`; the default, `6`, matches the command-line default, i.e. equivalent to passing `-6`. Thanks @mgarbuzov for the request and @philippechataignon for implementing. + ## BUG FIXES 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR. @@ -95,7 +99,7 @@ rowwiseDT( # [1] "V1" "b" "c" ``` -4. Queries like `DT[, min(x):max(x)]` now work as expected, i.e. the same as `DT[, seq(min(x), max(x))]` or `with(DT, min(x):max(x))`, [#2069](https://github.com/Rdatatable/data.table/issues/2069). Shorthand like `DT[, a:b]` meaning "select from columns `a` through `b`" still works. Thanks to @franknarf1 for reporting, @jangorecki for the fix, and @MichaelChirico for a follow-up ensuring back-compatibility. +4. Queries like `DT[, min(x):max(x)]` now work as expected, i.e. the same as `DT[, seq(min(x), max(x))]` or `with(DT, min(x):max(x))`, [#2069](https://github.com/Rdatatable/data.table/issues/2069). 
Shorthand like `DT[, a:b]` meaning "select from columns `a` through `b`" still works. Thanks to @franknarf1 for reporting, @jangorecki for the fix, and @MichaelChirico for follow-ups ensuring back-compatibility. 5. `fread()` performance improves when specifying `Date` among `colClasses`, [#6105](https://github.com/Rdatatable/data.table/issues/6105). One implication of the change is that the column will be an `IDate` (which also inherits from `Date`), which may affect code strongly relying on the column class to be `Date` exactly; computations with `IDate` and `Date` columns should otherwise be the same. If you strongly prefer the `Date` class, run `as.Date()` explicitly following `fread()`. Thanks @scipima for the report and @MichaelChirico for the fix. @@ -111,17 +115,46 @@ rowwiseDT( 11. `tables()` now returns the correct size for data.tables over 2GiB, [#6607](https://github.com/Rdatatable/data.table/issues/6607). Thanks to @vlulla for the report and the PR. -12. Joins on multiple columns, such as `x[y, on=c("x1==y1", "x2==y1")]`, could fail during implicit type coercions if `x1` and `x2` had different but still compatible types, [#6602](https://github.com/Rdatatable/data.table/issues/6602). This was particularly unexpected when columns `x1`, `x2`, and `y1` were all of the same class, e.g. `Date`, but differed in their underlying storage types. Thanks to Benjamin Schwendinger for the report and the fix. - -13. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. +12. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix. -14. 
Added a `data.frame` method for `format_list_item()` to fix error printing data.tables with columns containing 1-column data.frames, [#6592](https://github.com/Rdatatable/data.table/issues/6592). Thanks to @r2evans for the bug report and fix. +13. Added a `data.frame` method for `format_list_item()` to fix error printing data.tables with columns containing 1-column data.frames, [#6592](https://github.com/Rdatatable/data.table/issues/6592). Thanks to @r2evans for the bug report and fix. -15. Auto-printing gets some substantial improvements +14. Auto-printing gets some substantial improvements - Suppression in `knitr` documents is now done by implementing a method for `knit_print` instead of looking up the call stack, [#6589](https://github.com/Rdatatable/data.table/pull/6589). The old way was fragile and wound up broken by some implementation changes in {knitr}. Thanks to @jangorecki for the report [#6509](https://github.com/Rdatatable/data.table/issues/6509) and @aitap for the fix. - `print()` methods for S3 subclasses of data.table (e.g. an object of class `c("my.table", "data.table", "data.frame")`) no longer print where plain data.tables wouldn't, e.g. `myDT[, y := 2]`, [#3029](https://github.com/Rdatatable/data.table/issues/3029). The improved detection of auto-printing scenarios has the added benefit of _allowing_ print in highly explicit statements like `print(DT[, y := 2])`, obviating our recommendation since v1.9.6 to append `[]` to signal "please print me". -16. Joins of `integer64` and `double` columns succeed when the `double` column has lossless `integer64` representation, [#4167](https://github.com/Rdatatable/data.table/issues/4167) and [#6625](https://github.com/Rdatatable/data.table/issues/6625). Previously, this only worked when the double column had lossless _32-bit_ integer representation. Thanks @MichaelChirico for the reports and fix. +15. 
Joins of `integer64` and `double` columns succeed when the `double` column has lossless `integer64` representation, [#4167](https://github.com/Rdatatable/data.table/issues/4167) and [#6625](https://github.com/Rdatatable/data.table/issues/6625). Previously, this only worked when the double column had lossless _32-bit_ integer representation. Thanks @MichaelChirico for the reports and fix. + +16. `DT[order(...)]` better matches `base::order()` behavior by (1) recognizing the `method=` argument (and erroring since this is not supported) and (2) accepting a vector of `TRUE`/`FALSE` in `decreasing=` as an alternative to using `-a` to convey "sort `a` decreasing", [#4456](https://github.com/Rdatatable/data.table/issues/4456). Thanks @jangorecki for the FR and @MichaelChirico for the PR. + +17. Assignment with `:=` to an S4 slot of an under-allocated data.table now works, [#6704](https://github.com/Rdatatable/data.table/issues/6704). Thanks @MichaelChirico for the report and fix. + +18. `as.data.table()` method for `data.frame`s (especially those with extended classes) is more consistent with `as.data.frame()` with respect to retention of attributes, [#5699](https://github.com/Rdatatable/data.table/issues/5699). Thanks @jangorecki for the report and fix. + +19. Grouped queries on keyed tables no longer return an incorrectly keyed result if the _ad hoc_ `by=` list has some function call (in particular, a function which happens to return a strictly decreasing function of the keys), e.g. `by=.(a = rev(a))`, [#5583](https://github.com/Rdatatable/data.table/issues/5583). Thanks @AbrJA for the report and @MichaelChirico for the fix. + +20. Assigning `list(NULL)` to a list column now replaces the column with `list(NULL)`, instead of deleting the column [#5558](https://github.com/Rdatatable/data.table/issues/5558). This behavior is now consistent with base `data.frame`. Thanks @tdhock for the report and @joshhwuu for the fix. 
This is due to a fundamental ambiguity from both allowing list columns _and_ making the use of `list()` to wrap `j=` arguments optional. We think that the code behaves as expected in all cases now. See the below for some illustration: + + ```r + DT = data.table(L=list(1L), i=2L, c='a') + + DT[, i := NULL] # delete i + DT[, L := NULL] # delete L + + DT[, i := list(NULL)] # overwrite: identical(DT$i, list(NULL)) + # ^ ** THIS IS A CHANGE FROM PREVIOUS BEHAVIOR WHICH WOULD DELETE i ** + DT[, L := list(NULL)] # assignment: identical(DT$L, list(NULL)) + + DT[, i := .(3L)] # assignment: identical(DT$i, 3L) + DT[, i := .('a')] # overwrite: identical(DT$i, 'a') + DT[, L := .(list(NULL))] # assignment: identical(DT$L, list(NULL)) + + DT[, c('L', 'i') := list(NULL, NULL)] # delete L,i + DT[, c('L', 'i') := list(list(NULL), 3L)] # assignment: identical(DT$L, list(NULL)), identical(DT$i, 3L) + DT[, c('L', 'i') := list(NULL, 3L)] # delete L, assign to i + DT[, c('L', 'i') := list(list(NULL), NULL)] # assign to L, delete i + ``` +21. An integer overflow in `fread()` with lines longer than `2^(31/2)` bytes is prevented, [#6729](https://github.com/Rdatatable/data.table/issues/6729). The typical impact was no worse than a wrong initial allocation size, corrected later. Thanks to @TaikiSan21 for the report and @aitap for the fix. ## NOTES @@ -139,6 +172,23 @@ rowwiseDT( 7. The `dcast()` and `melt()` generics no longer attempt to redirect to {reshape2} methods when passed non-`data.table`s. If you're still using {reshape2}, you must use namespace-qualification: `reshape2::dcast()`, `reshape2::melt()`. We have been warning about the deprecation since v1.12.4 (2019). Please note that {reshape2} is retired. +8. `showProgress` in `[` is disabled for "trivial" grouping (`.NGRP==1L`), [#6668](https://github.com/Rdatatable/data.table/issues/6668). Thanks @MichaelChirico for the request and @joshhwuu for the PR. + +9. 
`key<-`, marked as deprecated since 2012 and unusable since v1.15.0, has been fully removed. + +10. The following in-progress deprecations have proceeded: + + + Using `fwrite(logicalAsInt=)` has been upgraded from a warning (since v1.15.0) to an error. It will be removed in the next release. + + Using `fread(autostart=)` has been upgraded to an error. It has been warning since v1.11.0 (6 years ago). The argument will be removed in the next release. + + Using `droplevels(in.place=TRUE)` (warning since v1.16.0) has been upgraded from warning to error. The argument will be removed in the next release. + + Use of `:=` and `with=FALSE` in `[` has been upgraded from warning (since v1.15.0) to error. Long ago (before 2014), this was needed when, e.g., assigning to a vector of column names defined outside the table, but `with=FALSE` is no longer needed to do so: `DT[, (cols) := ...]` works fine. + +# data.table [v1.16.4](https://github.com/Rdatatable/data.table/milestone/36) 4 December 2024 + +## BUG FIXES + +1. Joins on multiple columns, such as `x[y, on=c("x1==y1", "x2==y1")]`, could fail during implicit type coercions if `x1` and `x2` had different but still compatible types, [#6602](https://github.com/Rdatatable/data.table/issues/6602). This was particularly unexpected when columns `x1`, `x2`, and `y1` were all of the same class, e.g. `Date`, but differed in their underlying storage types. Thanks to Benjamin Schwendinger for the report and the fix. + # data.table [v1.16.2](https://github.com/Rdatatable/data.table/milestone/35) (9 October 2024) ## BUG FIXES @@ -286,7 +336,7 @@ rowwiseDT( 5. Input files are now kept open during `mmap()` when running under Emscripten, [emscripten-core/emscripten#20459](https://github.com/emscripten-core/emscripten/issues/20459). This avoids an error in `fread()` when running in WebAssembly, [#5969](https://github.com/Rdatatable/data.table/issues/5969). 
Thanks to @maek-ies for the report and @georgestagg for the PR. -6. `dcast()` improves behavior for the situation that the `fun.aggregate` value of `length()` is used but not provided by the user. +6. `dcast()` improves behavior for the situation that the `fun.aggregate` value of `length()` is used but not provided by the user. a. This now triggers a warning, not a message, since relying on this default often signals unexpected duplicates in the data, [#5386](https://github.com/Rdatatable/data.table/issues/5386). The warning is classed as `dt_missing_fun_aggregate_warning`, allowing for more targeted handling in user code. Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. @@ -1001,7 +1051,7 @@ rowwiseDT( 14. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively. -15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). +15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463). 16. `key(x) <- value` is now fully deprecated (from warning to error). Use `setkey()` to set a table's key. We started warning not to use this approach in 2012, with a stronger warning starting in 2019 (1.12.2). This function will be removed in the next release. 
diff --git a/R/as.data.table.R b/R/as.data.table.R index 3137f75322..09c026aea5 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -214,6 +214,8 @@ as.data.table.list = function(x, } as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) { + if (is.data.table(x)) return(as.data.table.data.table(x)) # S3 is weird, #6739. Also # nocov; this is tested in 2302.{2,3}, not sure why it doesn't show up in coverage. + if (!identical(class(x), "data.frame")) return(as.data.table(as.data.frame(x))) if (!isFALSE(keep.rownames)) { # can specify col name to keep.rownames, #575; if it's the same as key, # kludge it to 'rn' since we only apply the new name afterwards, #4468 diff --git a/R/data.table.R b/R/data.table.R index 6594cb928c..8ef2924fef 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -459,9 +459,9 @@ replace_dot_alias = function(e) { if (!len_common_names) stopf("Attempting to do natural join but no common columns in provided tables") if (verbose) { which_cols_msg = if (len_common_names == length(x)) { - catf("Joining but 'x' has no key, natural join using all 'x' columns") + catf("Joining but 'x' has no key, natural join using all 'x' columns\n") } else { - catf("Joining but 'x' has no key, natural join using: %s", brackify(common_names)) + catf("Joining but 'x' has no key, natural join using: %s\n", brackify(common_names)) } } on = common_names @@ -685,19 +685,11 @@ replace_dot_alias = function(e) { # j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic) if (is.null(jsub)) return(NULL) - if (!with && jsub %iscall% ":=") { - # TODO: make these both errors (or single long error in both cases) in next release. - # i.e. using with=FALSE together with := at all will become an error. Eventually with will be removed. - if (is.null(names(jsub)) && is.name(jsub[[2L]])) { - warningf("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. 
Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning.") - jsub[[2L]] = eval(jsub[[2L]], parent.frame(), parent.frame()) - } else { - warningf("with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.") - } - with = TRUE - } - if (!with) { + if (jsub %iscall% ":=") { + # TODO(>=1.18.0): Simplify this error + stopf("with=FALSE together with := was deprecated in v1.9.4 released Oct 2014; this has been warning since v1.15.0. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples.") + } # missingby was already checked above before dealing with i if (jsub %iscall% c("!", "-") && length(jsub)==2L) { # length 2 to only match unary, #2109 notj = TRUE @@ -1153,7 +1145,8 @@ replace_dot_alias = function(e) { } } names(jsub)="" - jsub[[1L]]=as.name("list") + # dont wrap the RHS in list if it is a singular NULL and if not creating a new column + if (length(jsub[-1L]) == 1L && as.character(jsub[-1L]) == 'NULL' && all(lhs %chin% names_x)) jsub[[1L]]=as.name("identity") else jsub[[1L]]=as.name("list") } av = all.vars(jsub,TRUE) if (!is.atomic(lhs)) stopf("LHS of := must be a symbol, or an atomic vector (column names or positions).") @@ -1221,7 +1214,7 @@ replace_dot_alias = function(e) { setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope if (is.name(name)) { assign(as.character(name),x,parent.frame(),inherits=TRUE) - } else if (name %iscall% c('$', '[[') && is.name(name[[2L]])) { + } else if (.is_simple_extraction(name)) { # TODO(#6702): use a helper here as the code is very similar to setDT(). 
k = eval(name[[2L]], parent.frame(), parent.frame()) if (is.list(k)) { origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) @@ -1233,6 +1226,8 @@ replace_dot_alias = function(e) { .Call(Csetlistelt,k,as.integer(j), x) } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { assign(as.character(name[[3L]]), x, k, inherits=FALSE) + } else if (isS4(k)) { + .Call(CsetS4elt, k, as.character(name[[3L]]), x) } } # TO DO: else if env$<- or list$<- } @@ -2012,8 +2007,8 @@ replace_dot_alias = function(e) { if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()} setkeyv(ans,names(ans)[seq_along(byval)]) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} - } else if (keyby || (haskey(x) && bysameorder && (byjoin || (length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))))))) { - setattr(ans,"sorted",names(ans)[seq_along(grpcols)]) + } else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) { + setattr(ans, "sorted", names(ans)[seq_along(grpcols)]) } setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } @@ -2077,8 +2072,6 @@ as.matrix.data.table = function(x, rownames=NULL, rownames.value=NULL, ...) { if (!is.null(rownames)) { if (!is.null(rownames.value)) stopf("rownames and rownames.value cannot both be used at the same time") if (length(rownames)>1L) { - # TODO in future as warned in NEWS for 1.11.6: - # warningf("length(rownames)>1 is deprecated. Please use rownames.value= instead") if (length(rownames)!=nrow(x)) stopf("length(rownames)==%d but nrow(DT)==%d. The rownames argument specifies a single column name or number. 
Consider rownames.value= instead.", length(rownames), nrow(x)) rownames.value = rownames @@ -2969,7 +2962,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { if (is.name(name)) { name = as.character(name) assign(name, x, parent.frame(), inherits=TRUE) - } else if (name %iscall% c('$', '[[') && is.name(name[[2L]])) { + } else if (.is_simple_extraction(name)) { # common case is call from 'lapply()' k = eval(name[[2L]], parent.frame(), parent.frame()) if (is.list(k)) { @@ -2981,10 +2974,18 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { stopf("Item '%s' not found in names of input list", origj) } } - .Call(Csetlistelt,k,as.integer(j), x) + .Call(Csetlistelt, k, as.integer(j), x) } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { assign(as.character(name[[3L]]), x, k, inherits=FALSE) + } else if (isS4(k)) { + .Call(CsetS4elt, k, as.character(name[[3L]]), x) } + } else if (name %iscall% "get") { # #6725 + # edit 'get(nm, env)' call to be 'assign(nm, x, envir=env)' + name = match.call(get, name) + name[[1L]] = quote(assign) + name$value = x + eval(name, parent.frame(), parent.frame()) } .Call(CexpandAltRep, x) # issue#2866 and PR#2882 invisible(x) @@ -3043,13 +3044,31 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { ids } +.by_result_is_keyable = function(x, keyby, bysameorder, byjoin, byvars, bysub) { + if (keyby) return(TRUE) + k = key(x) + if (is.null(k)) return(FALSE) # haskey(x) but saving 'k' for below + if (!bysameorder) return(FALSE) + if (byjoin) return(TRUE) + if (!length(byvars)) return(FALSE) + if (!identical(byvars, head(k, length(byvars)))) return(FALSE) # match key exactly, in order + # For #5583, we also ensure there are no function calls in by (which might break sortedness) + if (is.name(bysub)) return(TRUE) + if (identical(bysub[[1L]], quote(list))) bysub = bysub[-1L] + if (length(all.names(bysub)) > length(byvars)) return(FALSE) + TRUE +} + .is_withFALSE_range = 
function(e, x, root=root_name(e), vars=all.vars(e)) { if (root != ":") return(FALSE) if (!length(vars)) return(TRUE) # e.g. 1:10 if (!all(vars %chin% names(x))) return(TRUE) # e.g. 1:ncol(x) - is.name(e[[1L]]) && is.name(e[[2L]]) # e.g. V1:V2, but not min(V1):max(V2) or 1:max(V2) + !is.call(e[[2L]]) && !is.call(e[[3L]]) # e.g. V1:V2, but not min(V1):max(V2) or 1:max(V2) } +# for assignments like x[[1]][, a := 2] or setDT(x@DT) +.is_simple_extraction = function(e) e %iscall% c('$', '@', '[[') && is.name(e[[2L]]) + # GForce functions # to add a new function to GForce (from the R side -- the easy part!): # (1) add it to gfuns diff --git a/R/fdroplevels.R b/R/fdroplevels.R index 3116a3f85f..642b76e090 100644 --- a/R/fdroplevels.R +++ b/R/fdroplevels.R @@ -8,10 +8,9 @@ fdroplevels = function(x, exclude = if (anyNA(levels(x))) NULL else NA, ...) { return(ans) } -droplevels.data.table = function(x, except=NULL, exclude, in.place=FALSE, ...){ - stopifnot(is.logical(in.place)) - if (isTRUE(in.place)) warningf("droplevels() with in.place=TRUE is deprecated. Use setdroplevels() instead.") - if (!in.place) x = copy(x) +droplevels.data.table = function(x, except=NULL, exclude, in.place=NULL, ...){ + if (!is.null(in.place)) stopf("droplevels() with in.place=TRUE is deprecated. 
Use setdroplevels() instead.") + x = copy(x) if (missing(exclude)) exclude = NULL setdroplevels(x, except, exclude)[] } diff --git a/R/fread.R b/R/fread.R index 64247d7aca..5bf2a97845 100644 --- a/R/fread.R +++ b/R/fread.R @@ -7,7 +7,7 @@ showProgress=getOption("datatable.showProgress",interactive()), data.table=getOp nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), logicalYN=getOption("datatable.logicalYN", FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), -yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") +yaml=FALSE, autostart=NULL, tmpdir=tempdir(), tz="UTC") { if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stopf("Used more than one of the arguments input=, file=, text= and cmd=.") input_has_vars = length(all.vars(substitute(input)))>0L # see news for v1.11.6 @@ -124,7 +124,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") input = file } - if (!missing(autostart)) warningf("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n"); + if (!is.null(autostart)) stopf("'autostart' is deprecated. Consider skip='string' or skip=n. This argument will be removed in the next release."); if (is.logical(colClasses)) { if (!allNA(colClasses)) stopf("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. 
See ?fread.") colClasses = NULL diff --git a/R/fwrite.R b/R/fwrite.R index d67c886d13..660b87f9d7 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -4,12 +4,13 @@ fwrite = function(x, file="", append=FALSE, quote="auto", na="", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS - logicalAsInt=logical01, + logicalAsInt=NULL, scipen=getOption('scipen', 0L), dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), + compressLevel = 6L, yaml = FALSE, bom = FALSE, verbose=getOption("datatable.verbose", FALSE), @@ -18,22 +19,17 @@ fwrite = function(x, file="", append=FALSE, quote="auto", if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { stopf("Argument 'encoding' must be '', 'UTF-8' or 'native'.") } - if (missing(qmethod)) qmethod = qmethod[1L] - if (missing(compress)) compress = compress[1L] - if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] } - else if (length(dateTimeAs)>1L) stopf("dateTimeAs must be a single string") - dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L - if (is.na(dateTimeAs)) stopf("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") - if (!missing(logical01) && !missing(logicalAsInt)) - stopf("logicalAsInt has been renamed logical01. Use logical01 only, not both.") - if (!missing(logicalAsInt)) { - warningf("logicalAsInt has been renamed logical01 for consistency with fread. 
It works fine for now but please change to logical01 at your convenience so we can remove logicalAsInt in future.") - logical01 = logicalAsInt - logicalAsInt=NULL + qmethod = match.arg(qmethod) + compress = match.arg(compress) + dateTimeAs = match.arg(dateTimeAs) + dateTimeAs = chmatch(dateTimeAs, c("ISO", "squash", "epoch", "write.csv")) - 1L + if (!is.null(logicalAsInt)) { + stopf("logicalAsInt has been renamed logical01 for consistency with fread.") } scipen = if (is.numeric(scipen)) as.integer(scipen) else 0L buffMB = as.integer(buffMB) nThread = as.integer(nThread) + compressLevel = as.integer(compressLevel) # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix @@ -46,7 +42,8 @@ fwrite = function(x, file="", append=FALSE, quote="auto", x = as.data.table(x) } } - stopifnot(is.list(x), + stopifnot( + is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), is.character(sep) && length(sep)==1L && (nchar(sep) == 1L || identical(sep, "")), is.character(sep2) && length(sep2)==3L && nchar(sep2[2L])==1L, @@ -55,6 +52,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %chin% c("double", "escape"), length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), + length(compressLevel) == 1L && 0 <= compressLevel && compressLevel <= 9, isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names), isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01), isTRUEorFALSE(bom), @@ -62,7 +60,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", is.character(file) && length(file)==1L && !is.na(file), length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024L, length(nThread)==1L && !is.na(nThread) && nThread>=1L - ) + ) is_gzip = compress == "gzip" || (compress == "auto" && endsWithAny(file, ".gz")) @@ -119,7 
+117,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, yaml, verbose, encoding) + showProgress, is_gzip, compressLevel, bom, yaml, verbose, encoding) invisible() } diff --git a/R/setkey.R b/R/setkey.R index 11b02ec2de..6ba1df4c85 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -18,11 +18,6 @@ setindexv = function(x, cols, verbose=getOption("datatable.verbose")) { } } -# Has been warning since 2012, with stronger warning in Mar 2019 (note in news for 1.12.2); #3399 -"key<-" = function(x,value) { - stopf("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012.") -} - setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE) { if (is.null(cols)) { # this is done on a data.frame when !cedta at top of [.data.table @@ -160,8 +155,10 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } -forder = function(..., na.last=TRUE, decreasing=FALSE) +forder = function(..., na.last=TRUE, decreasing=FALSE, method="radix") { + if (method != "radix") stopf("data.table has no support for sorting by method='%s'. Use base::order(), not order(), if you really need this.", method) + stopifnot(is.logical(decreasing), length(decreasing) > 0L, !is.na(decreasing)) sub = substitute(list(...)) tt = vapply_1b(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) if (any(tt)) sub[tt] = NULL # remove any NULL or empty arguments; e.g. 
test 1962.052: forder(DT, NULL) and forder(DT, ) @@ -169,8 +166,8 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) asc = rep.int(1L, length(sub)-1L) # ascending (1) or descending (-1) per column # the idea here is to intercept - (and unusual --+ deriving from built expressions) before vectors in forder(DT, -colA, colB) so that : # 1) - on character vector works; ordinarily in R that fails with type error - # 2) each column/expression can have its own +/- more easily that having to use a separate decreasing=TRUE/FALSE - # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and apply - to every element first + # 2) each column/expression can have its own +/- more easily than having to use a separate decreasing=TRUE/FALSE + # 3) we can pass the decreasing (-) flag to C and avoid what normally happens in R; i.e. allocate a new vector and negate every element first # We intercept the unevaluated expressions and massage them before evaluating in with(DT) scope or not depending on the first item. 
for (i in seq.int(2L, length(sub))) { v = sub[[i]] @@ -193,8 +190,16 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) } else { data = eval(sub, parent.frame(), parent.frame()) } - stopifnot(isTRUEorFALSE(decreasing)) - o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last) + if (length(decreasing) > 1L) { + if (any(asc < 0L)) stopf("Mixing '-' with vector decreasing= is not supported.") + if (length(decreasing) != length(asc)) stopf("decreasing= has length %d applied to sorting %d columns.", length(decreasing), length(asc)) + orderArg = fifelse(decreasing, -asc, asc) + } else if (decreasing) { + orderArg = -asc + } else { + orderArg = asc + } + o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=orderArg, na.last=na.last) if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o o } diff --git a/configure b/configure index 2402ecef1f..8e450c10b8 100755 --- a/configure +++ b/configure @@ -34,15 +34,11 @@ else NOZLIB=0 lib=`pkg-config --libs zlib` cflag=`pkg-config --cflags zlib` - expr -- "$lib" : ".*-lz$" >> config.log # -- for FreeBSD, #4652 + echo "$lib" | grep -qE '[-]lz($| )' >> config.log if [ $? -ne 0 ]; then - expr -- "$lib" : ".*-lz " >> config.log - # would use \b in one expr but MacOS does not support \b - if [ $? -ne 0 ]; then - echo "*** pkg-config is installed and 'pkg-config --exists zlib' succeeds but" - echo "*** 'pkg-config --libs zlib' returns '${lib}' which does not include the standard -lz." - msg=1 - fi + echo "*** pkg-config is installed and 'pkg-config --exists zlib' succeeds but" + echo "*** 'pkg-config --libs zlib' returns '${lib}' which does not include the standard -lz." 
+ msg=1 fi fi fi diff --git a/inst/tests/S4.Rraw b/inst/tests/S4.Rraw index 7e0b8111d6..19a595c08b 100644 --- a/inst/tests/S4.Rraw +++ b/inst/tests/S4.Rraw @@ -109,3 +109,12 @@ DT = data.table(a = rep(1:2, c(1, 100))) # Set the S4 bit on a simple object DT[, b := asS4(seq_len(.N))] test(6, DT[, b, by=a, verbose=TRUE][, isS4(b)], output="dogroups: growing") + +# setDT() works for a data.frame slot, #6701 +setClass("DataFrame", slots=c(x="data.frame")) +DF = new("DataFrame", x=data.frame(a=1)) +setDT(DF@x) +test(7.1, is.data.table(DF@x)) +# Similar code for under-allocated data.tables in S4 slots, #6704 +setClass("DataTable", slots=c(x="data.table")) +test(7.2, options=c(datatable.alloccol=0L), {DT = new("DataTable", x=data.table(a=1)); DT@x[, b := 2L]; DT@x$b}, 2L) # NB: requires assigning DT to test assignment back to that object diff --git a/inst/tests/issue_6729.txt.bz2 b/inst/tests/issue_6729.txt.bz2 new file mode 100644 index 0000000000..4b96e11f3f Binary files /dev/null and b/inst/tests/issue_6729.txt.bz2 differ diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d5b1622839..7f02bf4b66 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -159,6 +159,9 @@ TZnotUTC = !identical(tt,"") && !is_utc(tt) # (3) function factory for matching messages exactly by substituting anything between delimiters [delim, fmt=TRUE] # (4) function factory for matching messages exactly by substituting a generic string [fmt=string] get_msg = function(e, delim, fmt=FALSE) { + ufq = options(useFancyQuotes = FALSE) # otherwise we get angled quotes, hard to match robustly + on.exit(options(ufq)) + condition = tryCatch({e; NULL}, error=identity, warning=identity) if (is.null(condition)) return(condition) msg = condition$message @@ -170,17 +173,13 @@ get_msg = function(e, delim, fmt=FALSE) { sprintf("%s%s%s", delim[1L], if (fmt) "%s" else ".+", delim[2L]), msg ) - if (fmt) return(function(x) sprintf(msg, x)) + if (fmt) return(function(...) 
sprintf(msg, ...)) return(msg) } base_messages = list( missing_object = get_msg(`__dt_test_missing_` + 1, "'", fmt=TRUE), missing_function = get_msg(`__dt_test_missing_`(), '"', fmt=TRUE), - missing_coerce_method = get_msg(delim = '"', { - old = options(useFancyQuotes = FALSE) # otherwise we get angled quotes, hard to match robustly - on.exit(options(old)) - methods::as(TRUE, 'foo') - }), + missing_coerce_method = get_msg(methods::as(TRUE, 'foo'), delim = '"'), missing_dispatch_method = get_msg(conditionMessage(structure(1, class="foo")), '[\'"]'), invalid_arg_unary_operator = get_msg(-'a'), invalid_arg_binary_operator = get_msg(1 + 'a'), @@ -199,6 +198,8 @@ base_messages = list( stopifnot = get_msg(stopifnot(FALSE), fmt="FALSE"), not_yet_used = get_msg(.NotYetUsed("abc"), "'", fmt=TRUE), # NB: need fmt= because the English message has '(yet)' --> parens in regex ambiguous_date_fmt = get_msg(as.Date('xxx')), + match_arg_length = get_msg(match.arg(c('a', 'b'), letters)), + match_arg_4_choices = get_msg(match.arg('e', letters[1:4]), delim='"', fmt=TRUE), NULL ) @@ -1645,11 +1646,7 @@ test(504, dtB$k, 1:4) # or dtB DT = data.table(a=1:2,b=1:4,key="a") test(505, DT[J(a=1,b=6),sum(i.b*b),by=.EACHI]$V1, 24) # 24 now 'double' because i.b is 'double' -# Test := after a key<- -DT = data.table(a=3:1,b=4:6) -test(506, key(DT)<-"a", error="deprecated") - -# tests 508, 509, 510 related to follow-up operations after key<-, which are now irrelevant +# tests 506, 508, 509, 510 related to follow-up operations after key<-, which are now irrelevant # Test new functions chmatch and %chin% y=letters @@ -2908,7 +2905,7 @@ test(966, fread(input, colClasses=list(character=2:4)), data.table(A=1:2, B=c("f # Character input more than 4096 bytes (used to be passed through path.expand which imposed the limit), #2649 test(967, nrow(fread( strrep('a\tb\n', 10000L), header=FALSE)), 10000L) -# Test fread warns about removal of any footer (and autostart skips up over it) +# Test fread warns about 
removal of any footer (and skip= skips up over it) test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2") test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2") test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2") @@ -4658,7 +4655,7 @@ test(1241, DT[order(x,-y)], # optimized to forder() DT = data.table(a=1:3, b=4:6) myCol = "a" -test(1242.1, DT[2,myCol:=6L,with=FALSE], data.table(a=INT(1,6,3), b=4:6), warning="with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please") +test(1242.1, DT[2,myCol:=6L,with=FALSE], error="with=FALSE together with := was deprecated in v1.9.4") test(1242.2, DT[2,(myCol):=7L], data.table(a=INT(1,7,3), b=4:6)) # consistency of output type of mult, #340 @@ -8452,11 +8449,9 @@ baseR = base::order(c(x2,x1,x1,x2)) # are so relaxed now that they barely testing anything. It appears base R behaviour is undefined in this rare case of identical strings in different encodings. test(1590.04, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4)) Encoding(x2) = "unknown" -# TODO(#6350): Decide if this test should be adjusted for Alpine Linux, or just dropped. -if (!file.exists("/etc/alpine-release")) { - test(1590.05, x1!=x2) - test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below -} +# test(1590.05, x1!=x2) # Skip this test of R's own behavior since R itself does not give platform-consistent results, #6350 +# TODO(#6350): Restore this test. On Alpine, forder() finds this input to be sorted, but we want to be platform-independent. 
+if (!file.exists("/etc/alpine-release")) test(1590.06, forderv( c(x2,x1,x1,x2)), INT(1,4,2,3)) # consistent with Windows-1252 result, tested further below baseR = base::order(c(x2,x1,x1,x2)) test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) || identical(baseR, 1:4)) Sys.setlocale("LC_CTYPE", ctype) @@ -10012,7 +10007,7 @@ test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., ya test(1658.28, fwrite(ok_dt, 1), error=base_messages$stopifnot("is.character(file) && length(file) == 1L && !is.na(file)")) test(1658.29, fwrite(ok_dt, quote=123), error="identical\\(quote.*auto.*FALSE.*TRUE") test(1658.30, fwrite(ok_dt, sep="..."), error="nchar(sep)") -test(1658.31, fwrite(ok_dt, qmethod=c("double", "double")), error="length(qmethod)") +test(1658.31, fwrite(ok_dt, qmethod=c("double", "double")), error=base_messages$match_arg_length) test(1658.32, fwrite(ok_dt, col.names="foobar"), error="isTRUEorFALSE(col.names)") # null data table (no columns) @@ -10054,8 +10049,12 @@ if (!haszlib()) { test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0) if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform) fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz + fwrite(DT, file=f4<-tempfile(), compress="gzip", compressLevel=1) # test compressLevel + fwrite(DT, file=f5<-tempfile(), compress="gzip", compressLevel=9) test(1658.441, file.info(f3)$size, file.info(f1)$size) - unlink(c(f1,f2,f3)) + test(1658.442, file.info(f4)$size >= file.info(f1)$size) + test(1658.443, file.info(f1)$size >= file.info(f5)$size) + unlink(c(f1,f2,f3,f4,f5)) } DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c("foo", "bar", "baz"))) test(1658.45, fwrite(DT), output=c("a,b","1,1|2|3|4","2,3.14|1e+12","3,foo|bar|baz")) @@ -10104,6 +10103,23 @@ test(1658.58, fwrite(DT), output='a,b\n1,0\\+1i\n2,-1-1i\n3,$') test(1658.59, 
fwrite(data.table(a=list('a')), verbose=TRUE), output='fields will be quoted if the field contains either sep.*sep2.*list column') test(1658.60, fwrite(data.table(r=as.raw(0))), error = "'raw' - not yet implemented") +## naLen is bigger than col width +test(1658.61, fwrite(data.table(a="a"), na="VERY LONG MISSING VALUE STRING !", quote=FALSE, verbose=TRUE), + output="maxLineLen=66") +## eol="" error +test(1658.62, fwrite(data.table(a=1), tempfile(), eol=''), error='eol must be 1 or more bytes') + +## buffMB < single line width and < header width +f = tempfile() +test(1658.63, fwrite(data.table(a=strrep('x', 2**21)), f, buffMB=1.0), NULL) +test(1658.64, file.size(f) > 2**20) # almost exactly 2**21, but exact number will vary by platform. we just care we didn't truncate at 1MiB. +DT=data.table(1L) +setnames(DT, strrep('y', 2**21)) +test(1658.65, fwrite(DT, f, buffMB=1.0, nThread=1L), NULL) +test(1658.66, file.size(f) > 2**20) +unlink(f) + +test(1658.67, fwrite(data.table(a=numeric()), verbose=TRUE), output='No data rows present') options(oldverbose) ## End fwrite tests @@ -10562,7 +10578,11 @@ test(1702.1, isoweek(test_cases), test_values) # but calculating from Date brings these into play, #2407 test(1702.2, isoweek(as.Date(test_cases)), test_values) -# *** OBSCURE ERROR WHEN Sys.timezone() = 'America/Argentina/Buenos_Aires' *** +# *** OBSCURE DST ERROR WHEN Sys.timezone() = 'America/Argentina/Buenos_Aires' *** +# *** 00:00 does not exist, h/t rikivillalba@ who worked through this *** +# *** https://techcommunity.microsoft.com/blog/dstblog/argentina-is-changing-their-daylight-saving-time-on-december-30/311020 *** +# *** https://mm.icann.org/pipermail/tz/2007-December/014743.html *** +# *** Official IANA sources: https://www.iana.org/time-zones *** test(1702.3, isoweek(as.POSIXct(test_cases)), test_values) # 1% sample of a 400-year cycle of dates for extra robustness @@ -10937,10 +10957,7 @@ test(1736.03, fwrite(DT, sep2=c("",",","")), error="sep.*,.*sep2.*,.*must 
all be test(1736.04, fwrite(DT, sep2=c("","||","")), error="nchar.*sep2.*2") test(1736.05, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"), logical01=FALSE)), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", "2|c(15,16,17,18)|c(1.2,2.3,3.4,3.14159265358979,-9)", "3|c(7)|c(foo,bar)", "4|c(9,10)|c(TRUE,TRUE,FALSE)")) -test(1736.06, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE)), - c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}", - "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}"), - warning="logicalAsInt has been renamed logical01") +test(1736.06, fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE), error="logicalAsInt has been renamed logical01") DT = data.table(A=c("foo","ba|r","baz")) test(1736.07, capture.output(fwrite(DT,na="")), c("A","foo","ba|r","baz")) # no list column so no need to quote test(1736.08, capture.output(fwrite(DT)), c("A","foo","ba|r","baz")) @@ -10978,9 +10995,9 @@ DT = data.table( D = as.POSIXct(dt<-paste(d,t), tz="UTC"), E = as.POSIXct(paste0(dt,c(".999",".0",".5",".111112",".123456",".023",".0",".999999",".99",".0009")), tz="UTC")) -test(1740.0, fwrite(DT,dateTimeAs="iso"), error="dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") -test(1740.1, fwrite(DT,dateTimeAs=c("ISO","squash")), error="dateTimeAs must be a single string") -test(1740.2, capture.output(fwrite(DT,dateTimeAs="ISO")), c( +test(1740.1, fwrite(DT,dateTimeAs="iso"), error=base_messages$match_arg_4_choices("ISO", "squash", "epoch", "write.csv")) +test(1740.2, fwrite(DT,dateTimeAs=c("ISO","squash")), error=base_messages$match_arg_length) +test(1740.3, capture.output(fwrite(DT,dateTimeAs="ISO")), c( "A,B,C,D,E", "1907-10-21,1907-10-21,23:59:59,1907-10-21T23:59:59Z,1907-10-21T23:59:59.999Z", "1907-10-22,1907-10-22,00:00:00,1907-10-22T00:00:00Z,1907-10-22T00:00:00Z", @@ -10992,7 +11009,7 @@ test(1740.2, capture.output(fwrite(DT,dateTimeAs="ISO")), c( 
"1999-12-31,1999-12-31,01:23:45,1999-12-31T01:23:45Z,1999-12-31T01:23:45.999999Z", "2000-02-29,2000-02-29,23:59:59,2000-02-29T23:59:59Z,2000-02-29T23:59:59.990Z", "2016-09-12,2016-09-12,01:30:30,2016-09-12T01:30:30Z,2016-09-12T01:30:30.000900Z")) -test(1740.3, capture.output(fwrite(DT,dateTimeAs="squash")), c( +test(1740.4, capture.output(fwrite(DT,dateTimeAs="squash")), c( "A,B,C,D,E", "19071021,19071021,235959,19071021235959000,19071021235959999", "19071022,19071022,000000,19071022000000000,19071022000000000", @@ -11004,7 +11021,7 @@ test(1740.3, capture.output(fwrite(DT,dateTimeAs="squash")), c( "19991231,19991231,012345,19991231012345000,19991231012345999", "20000229,20000229,235959,20000229235959000,20000229235959990", "20160912,20160912,013030,20160912013030000,20160912013030000")) -test(1740.4, capture.output(fwrite(DT,dateTimeAs="epoch")), c( +test(1740.5, capture.output(fwrite(DT,dateTimeAs="epoch")), c( "A,B,C,D,E", "-22718,-22718,86399,-1962748801,-1962748800.001", "-22717,-22717,0,-1962748800,-1962748800", @@ -13184,7 +13201,7 @@ test(1925.10, as.ITime(x), structure(c(12L, 67L), class="ITime")) test(1925.11, as.ITime(x, ms='nearest'), structure(c(12L, 68L), class="ITime")) test(1925.12, as.ITime(x, ms='ceil'), structure(c(13L, 68L), class="ITime")) -test(1936.1, fread("A,B\n1,3\n2,4", autostart=1), data.table(A=1:2, B=3:4), warning="autostart.*deprecated.*Consider skip") +test(1936.1, fread("A,B\n1,3\n2,4", autostart=1), error="autostart.*deprecated.*Consider skip") if (.Platform$OS.type == "unix") test(1936.2, is.data.table(fread("ls ."))) # add helpful error to %between% @@ -13614,8 +13631,9 @@ test(1962.0482, forder(L), 3:1) test(1962.0483, forder(), NULL) setDT(DT) test(1962.049, forder(DT[ , 0L]), error = 'Attempting to order a 0-column') -test(1962.050, forder(DT, decreasing = NA), error = base_messages$stopifnot('isTRUEorFALSE(decreasing)')) -test(1962.051, forder(DT, decreasing = 1.4), error = 
base_messages$stopifnot('isTRUEorFALSE(decreasing)')) +test(1962.0500, forder(DT, decreasing = NA), error = base_messages$stopifnot('!is.na(decreasing)')) +test(1962.0510, forder(DT, decreasing = 1.4), error = base_messages$stopifnot('is.logical(decreasing)')) +test(1962.0511, forder(DT, decreasing=logical()), error=base_messages$stopifnot('length(decreasing) > 0L')) test(1962.052, forder(DT, NULL), 3:1) test(1962.053, forder(DT), 3:1) test(1962.054, forder(DT, ), 3:1) @@ -13935,8 +13953,8 @@ test(1967.42, x[3, rollends = rep(TRUE, 10L)], error = 'rollends must be length test(1967.43, x[ , ..], error = 'symbol .. is invalid') test(1967.44, x[NULL], data.table(NULL)) test(1967.45, x[ , NULL], NULL) -test(1967.46, x[ , 'b' := 6:10, with = FALSE], - data.table(a = 1:5, b = 6:10), warning = 'with=FALSE ignored') +test(1967.46, x[ , 'b' := 6:10, with=FALSE], error='with=FALSE together with :=') +x[, b := 6:10] test(1967.47, x[ , -1L, with = FALSE], data.table(b = 6:10)) test(1967.48, x[ , b, .SDcols = 'a'], 6:10, warning = "This j doesn't use .SD") @@ -15429,7 +15447,7 @@ L = list(1:3, NULL, 4:6) test(2058.18, length(L), 3L) test(2058.19, as.data.table(L), data.table(V1=1:3, V2=4:6)) # V2 not V3 # no DT = data.table(a=1:3, b=c(4,5,6)) -test(2058.20, DT[,b:=list(NULL)], data.table(a=1:3)) # no +test(2058.20, DT[,b:=list(NULL)], data.table(a=1:3, b=list(NULL))) # no # rbindlist improved error message, #3638 DT = data.table(a=1) @@ -15961,8 +15979,7 @@ DT[ , z := 0L] test(2074.31, dcast(DT, V1 ~ z, fun.aggregate=eval(quote(length)), value.var='z'), data.table(V1=c('a', 'b'), `0`=2:1,key='V1')) -# fwrite both logical args -test(2074.32, fwrite(DT, logical01=TRUE, logicalAsInt=TRUE), error="logicalAsInt has been renamed") +# 2074.32 tested that setting both logical01 and logicalAsInt errored; no longer relevant # merge.data.table test(2074.33, merge(DT, DT, by.x = 1i, by.y=1i), error="A non-empty vector of column names is required") @@ -17794,7 +17811,7 @@ if 
(base::getRversion() >= "3.4.0") { } test(2214.06, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) test(2214.07, droplevels(DT, 1)[["a"]], x[1:5]) -test(2214.08, droplevels(DT, in.place=TRUE), DT, warning="droplevels() with in.place=TRUE is deprecated.") +test(2214.08, droplevels(DT, in.place=TRUE), error="droplevels() with in.place=TRUE is deprecated.") # support ordered factors in fdroplevels o = factor(letters[1:10], ordered=TRUE) test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) @@ -19227,16 +19244,29 @@ for (opt in c(0, 1, 2)) { # Confusing behavior with DT[, min(var):max(var)] #2069 DT = data.table(t = c(2L, 1L, 3L), a=0, b=1) -test(2284.1, DT[, min(t):max(t)], with(DT, min(t):max(t))) -test(2284.2, DT[, 1:max(t)], with(DT, 1:max(t))) -test(2284.3, DT[, max(t):1], with(DT, max(t):1)) -test(2284.4, DT[, 1:t[1L]], with(DT, 1:t[1L])) -test(2284.5, DT[, t[2L]:1], with(DT, t[2L]:1)) -test(2284.6, DT[, min(a):max(t)], with(DT, min(a):max(t))) +test(2284.01, DT[, min(t):max(t)], with(DT, min(t):max(t))) +test(2284.02, DT[, 1:max(t)], with(DT, 1:max(t))) +test(2284.03, DT[, max(t):1], with(DT, max(t):1)) +test(2284.04, DT[, 1:t[1L]], with(DT, 1:t[1L])) +test(2284.05, DT[, t[2L]:1], with(DT, t[2L]:1)) +test(2284.06, DT[, min(a):max(t)], with(DT, min(a):max(t))) # but not every `:` with a call in from or to is with=TRUE, #6486 -test(2284.7, DT[, 1:ncol(DT)], DT) -test(2284.8, DT[, 2:(ncol(DT) - 1L)], DT[, "a"]) -test(2284.9, DT[, (ncol(DT) - 1L):ncol(DT)], DT[, c("a", "b")]) +test(2284.07, DT[, 1:ncol(DT)], DT) +test(2284.08, DT[, 2:(ncol(DT) - 1L)], DT[, "a"]) +test(2284.09, DT[, (ncol(DT) - 1L):ncol(DT)], DT[, c("a", "b")]) +# literal:var is the same as literal:literal and var:var +test(2284.10, DT[, 1:a], DT[, t:a]) +test(2284.11, DT[, 1:a], DT[, 1:2]) +test(2284.12, DT[, t:2], DT[, t:a]) +test(2284.13, DT[, t:2], DT[, 1:2]) +test(2284.14, DT[, 1:b], DT[, t:b]) +test(2284.15, DT[, 1:b], DT[, 1:3]) +test(2284.16, DT[, t:3], DT[, t:b]) +test(2284.17, DT[, t:3], 
DT[, 1:3]) +test(2284.18, DT[, 2:b], DT[, a:b]) +test(2284.19, DT[, 2:b], DT[, 2:3]) +test(2284.20, DT[, a:3], DT[, a:b]) +test(2284.21, DT[, a:3], DT[, 2:3]) # Extra regression tests for #4772, which was already incidentally fixed by #5183. x = data.table(a=1:3) @@ -20627,6 +20657,18 @@ setDT(d2) test(2295.1, !is.data.table(d1)) test(2295.2, rownames(d1), 'b') test(2295.3, is.data.table(d2)) +# Ensure against regression noted in #6725 +x = data.frame(a=1) +e = environment() +foo = function(nm, env) { + setDT(get(nm, envir=env)) +} +foo('x', e) +test(2295.4, is.data.table(x)) +e = new.env(parent=topenv()) +e$x = data.frame(a=1) +foo('x', e) +test(2295.5, is.data.table(e$x)) # #6588: .checkTypos used to give arbitrary strings to stopf as the first argument test(2296, d2[x %no such operator% 1], error = '%no such operator%') @@ -20697,3 +20739,313 @@ if (test_bit64) { test(2300.3, DT1[DT2, on='id'], error="Incompatible join types") test(2300.4, DT2[DT1, on='id'], error="Incompatible join types") } + +# interpret more arguments to order() correctly when translating to forder(), #4456 +DT = data.table(a=rep(1:3, 4), b=rep(1:2, 6)) +test(2301.1, DT[order(a, method="auto")], error="no support for sorting by method='auto'") +test(2301.2, DT[order(a, b, decreasing=c(TRUE, FALSE))], DT[order(-a, b)]) +test(2301.3, DT[order(a, -b, decreasing=c(TRUE, TRUE))], error="Mixing '-' with vector decreasing") +test(2301.4, DT[order(a, b, decreasing=c(TRUE, TRUE, FALSE))], error="decreasing= has length 3") + +# as.data.table should remove extra attributes from extended data.frames #5699 +x = data.frame(a=c(1, 5, 3), b=c(2, 4, 6)) +class(x) = c("tbl", "data.frame") +attr(x, "t1") = "a" +as.data.frame.tbl = function(x) { + attr(x, "t1") = NULL + class(x) = "data.frame" + x +} +test(2302.1, attr(as.data.table(y), "t1"), attr(as.data.frame(y), "t1")) +class(x) = c("data.frame", "data.table", "data.frame") # #6739 +test(2302.2, attr(as.data.table(y), "t1"), attr(as.data.frame(y), "t1")) 
+class(x) = c("data.frame", "data.table") # duplicate class is overkill +test(2302.3, attr(as.data.table(y), "t1"), attr(as.data.frame(y), "t1")) + +# by=foo(KEY) does not retain key (no way to guarantee monotonic transformation), #5583 +DT = data.table(a=1:2, key='a') +test(2303.1, DT[, .N, by=.(b=rev(a))], data.table(b=2:1, N=1L)) +test(2303.2, DT[, .(N=1L), by=.(b=rev(a))], data.table(b=2:1, N=1L)) # ensure no interaction with GForce +DT = data.table(a=2:3, b=1:0, key=c('a', 'b')) +test(2303.3, DT[, .N, by=.(ab=a^b, d=c(1L, 1L))], data.table(ab=c(2, 1), d=1L, N=1L)) + +# tests for new consistent replacement of list columns with list(NULL), #5558 +# replacement of a list column with list(NULL) in a single-row data.table, using different assignment methods +DT = data.table(L=list("A"), i=1L) +ans = data.table(L=list(NULL), i=1L) +# test using replacement with $ operator +DT$L = list(NULL) +test(2304.001, DT, ans) +DT = data.table(L=list("A"), i=1L) +# standard form with := operator +test(2304.002, copy(DT)[, L := list(NULL)], ans) +# functional form with := operator +test(2304.003, copy(DT)[, `:=`(L=list(NULL))], ans) +# functional form with 'let' alias +test(2304.004, copy(DT)[, let(L=list(NULL))], ans) +# using set() +test(2304.005, set(copy(DT), j="L", value=list(NULL)), ans) + +# replacement of multiple list columns with list(NULL) in a single-row data.table, using different assignment methods +DT = data.table(L1=list("A"), L2=list("B"), i=1L) +ans = data.table(L1=list(NULL), L2=list(NULL), i=1L) +DT$L1 = list(NULL) +DT$L2 = list(NULL) +test(2304.006, DT, ans) +DT = data.table(L1=list("A"), L2=list("B"), i=1L) +# standard form with := operator +test(2304.007, copy(DT)[, c("L1", "L2") := list(list(NULL), list(NULL))], ans) +# functional form with := operator +test(2304.008, copy(DT)[, `:=`(L1=list(NULL), L2=list(NULL))], ans) +# functional form with 'let' alias +test(2304.009, copy(DT)[, let(L1=list(NULL), L2=list(NULL))], ans) +# using set() +test(2304.010, 
set(copy(DT), j=c("L1", "L2"), value=list(list(NULL), list(NULL))), ans) + +# replacement of a list column with list(NULL) in a multi-row data.table, using different assignment methods +DT = data.table(L=list("A", "B"), i=1L) +ans = data.table(L=list(NULL, NULL), i=1L) +# test using replacement with $ operator +DT$L = list(NULL) +test(2304.011, DT, ans) +DT = data.table(L=list("A", "B"), i=1L) +# standard form with := operator +test(2304.012, copy(DT)[, L := list(NULL)], ans) +# functional form with := operator +test(2304.013, copy(DT)[, `:=`(L=list(NULL))], ans) +# functional form with 'let' alias +test(2304.014, copy(DT)[, let(L=list(NULL))], ans) +# using set() +test(2304.015, set(copy(DT), j="L", value=list(NULL)), ans) + +# replacement of multiple list columns with list(NULL) in a multi-row data.table, using different assignment methods +DT = data.table(L1=list("A", "B"), L2=list("B", "C"), i=1L) +ans = data.table(L1=list(NULL, NULL), L2=list(NULL, NULL), i=1L) +DT$L1 = list(NULL) +DT$L2 = list(NULL) +test(2304.016, DT, ans) +DT = data.table(L1=list("A", "B"), L2=list("B", "C"), i=1L) +# standard form with := operator +test(2304.017, copy(DT)[, c("L1", "L2") := list(list(NULL), list(NULL))], ans) +# functional form with := operator +test(2304.018, copy(DT)[, `:=`(L1=list(NULL), L2=list(NULL))], ans) +# functional form with 'let' alias +test(2304.019, copy(DT)[, let(L1=list(NULL), L2=list(NULL))], ans) +# using set() +test(2304.020, set(copy(DT), j=c("L1", "L2"), value=list(list(NULL), list(NULL))), ans) + +# Adding an empty list column to a single-row data.table, using different assignment methods +DT = data.table(L=list("A"), i=1L) +ans = data.table(L=list("A"), i=1L, D=list(NULL)) +warn = "Tried to assign NULL to column 'D', but this column does not exist to remove" +# try to add a new empty list by list(NULL) with := in standard form, warns and does not change +test(2304.021, copy(DT)[, D := list(NULL)], DT, warning=warn) +test(2304.022, set(copy(DT), 
j="D", value=NULL), DT, warning=warn) +test(2304.023, set(copy(DT), j="D", value=list(NULL)), DT, warning=warn) +# add a new column by wrapping list(NULL), consistent with old behavior +DT$D = list(list(NULL)) +test(2304.024, DT, ans) +DT = data.table(L=list("A"), i=1L) +# test adding empty list column in standard form with := operator +test(2304.025, copy(DT)[, D := .(list(NULL))], ans) +# functional form with := operator +test(2304.026, copy(DT)[, `:=`(D=list(NULL))], ans) +# functional form with 'let' alias +test(2304.027, copy(DT)[, let(D=list(NULL))], ans) +# using set() +test(2304.028, set(copy(DT), j="D", value=list(list(NULL))), ans) + +# Adding multiple empty list columns to a single-row data.table, using different assignment methods +DT = data.table(L=list("A"), i=1L) +ans = data.table(L=list("A"), i=1L, D=list(NULL), R=list(NULL)) +DT$D = list(list(NULL)) +DT$R = list(list(NULL)) +test(2304.029, DT, ans) +DT = data.table(L=list("A"), i=1L) +# standard form with := operator +test(2304.030, copy(DT)[, c("D", "R") := .(list(NULL))], ans) +test(2304.031, copy(DT)[, c("D", "R") := .(list(NULL), list(NULL))], ans) +# functional form with := operator +test(2304.032, copy(DT)[, `:=`(D=list(NULL), R=list(NULL))], ans) +# functional form with 'let' alias +test(2304.033, copy(DT)[, let(D=list(NULL), R=list(NULL))], ans) +# using set() +test(2304.034, set(copy(DT), j=c("D", "R"), value=list(list(NULL))), ans) +test(2304.035, set(copy(DT), j=c("D", "R"), value=list(list(NULL), list(NULL))), ans) + +# Adding an empty list column to a multi-row data.table, using different assignment methods +DT = data.table(L=list("A", "B"), i=1L) +ans = data.table(L=list("A", "B"), i=1L, D=list(NULL, NULL)) +warn = "Tried to assign NULL to column 'D', but this column does not exist to remove" +# try to add a new empty list by list(NULL) with := in standard form, warns and does not change +test(2304.036, copy(DT)[, D := list(NULL)], DT, warning=warn) +test(2304.037, set(copy(DT), 
j="D", value=NULL), DT, warning=warn) +test(2304.038, set(copy(DT), j="D", value=list(NULL)), DT, warning=warn) +# add a new column by wrapping list(NULL), consistent with old behavior +DT$D = list(list(NULL)) +test(2304.039, DT, ans) +DT = data.table(L=list("A", "B"), i=1L) +# test adding empty list column in standard form with := operator +test(2304.040, copy(DT)[, D := .(list(NULL))], ans) +# functional form with := operator +test(2304.041, copy(DT)[, `:=`(D=list(NULL))], ans) +# functional form with 'let' alias +test(2304.042, copy(DT)[, let(D=list(NULL))], ans) +# using set() +test(2304.043, set(copy(DT), j="D", value = list(list(NULL))), ans) + +# Adding multiple empty list columns to a multi-row data.table, using different assignment methods +DT = data.table(L=list("A", "B"), i=1L) +ans = data.table(L=list("A", "B"), i=1L, D=list(NULL, NULL), R=list(NULL, NULL)) +DT$D = list(list(NULL)) +DT$R = list(list(NULL)) +test(2304.044, DT, ans) +DT = data.table(L=list("A", "B"), i=1L) +# standard form with := operator +test(2304.045, copy(DT)[, c("D", "R") := .(list(NULL))], ans) +test(2304.046, copy(DT)[, c("D", "R") := .(list(NULL), list(NULL))], ans) +# functional form with := operator +test(2304.047, copy(DT)[, `:=`(D=list(NULL), R=list(NULL))], ans) +# functional form with 'let' alias +test(2304.048, copy(DT)[, let(D=list(NULL), R=list(NULL))], ans) +# using set() +test(2304.049, set(copy(DT), j=c("D", "R"), value=list(list(NULL))), ans) +test(2304.050, set(copy(DT), j=c("D", "R"), value=list(list(NULL), list(NULL))), ans) + +# Removal of a list column in a single-row data.table, using different assignment methods +# NOTE: There is only one way to remove columns now, by assigning to NULL +DT = data.table(L=list("A"), i=1L) +ans = data.table(i=1L) +# test removing a list column by assigning to NULL +DT$L = NULL +test(2304.051, DT, ans) +DT = data.table(L=list("A"), i=1L) +# standard form with := operator +test(2304.052, copy(DT)[, L := NULL], ans) +# functional 
form with := operator +test(2304.053, copy(DT)[, `:=`(L=NULL)], ans) +# functional form with 'let' alias +test(2304.054, copy(DT)[, let(L=NULL)], ans) +# using set() +test(2304.055, set(copy(DT), j="L", value=NULL), ans) + +# Removal of multiple list columns in a single-row data.table, using different assignment methods +DT = data.table(L1=list("A"), L2=list("B"), i=1L) +# test removing two list columns by assigning to NULL +DT$L1 = NULL +DT$L2 = NULL +test(2304.056, DT, ans) +DT = data.table(L1=list("A"), L2=list("B"), i=1L) +# standard form with := operator +test(2304.057, copy(DT)[, c("L1", "L2") := NULL], ans) +test(2304.058, copy(DT)[, c("L1", "L2") := .(NULL, NULL)], ans) +# functional form with := operator +test(2304.059, copy(DT)[, `:=`(L1=NULL, L2=NULL)], ans) +# functional form with 'let' alias +test(2304.060, copy(DT)[, let(L1=NULL, L2=NULL)], ans) +# using set() +test(2304.061, set(copy(DT), j=c("L1", "L2"), value=NULL), ans) +test(2304.062, set(copy(DT), j=c("L1", "L2"), value=list(NULL, NULL)), ans) + +# Removal of a list column in a multi-row data.table, using different assignment methods +DT = data.table(L=list("A", "B"), i=1L) +ans = data.table(i=c(1L, 1L)) +# test removing a list column by assigning to NULL +DT$L = NULL +test(2304.063, DT, ans) +DT = data.table(L=list("A", "B"), i=1L) +# standard form with := operator +test(2304.064, copy(DT)[, L := NULL], ans) +# functional form with := operator +test(2304.065, copy(DT)[, `:=`(L=NULL)], ans) +# functional form with 'let' alias +test(2304.066, copy(DT)[, let(L=NULL)], ans) +# using set() +test(2304.067, set(copy(DT), j="L", value=NULL), ans) + +# Removal of multiple list columns in a multi-row data.table, using different assignment methods +DT = data.table(L1=list("A", "B"), L2=list("B", "C"), i=1L) +# test removing two list columns by assigning to NULL +DT$L1 = NULL +DT$L2 = NULL +test(2304.068, DT, ans) +DT = data.table(L1=list("A", "B"), L2=list("B", "C"), i=1L) +# standard form with := 
operator +test(2304.069, copy(DT)[, c("L1", "L2") := NULL], ans) +test(2304.070, copy(DT)[, c("L1", "L2") := .(NULL, NULL)], ans) +# functional form with := operator +test(2304.071, copy(DT)[, `:=`(L1=NULL, L2=NULL)], ans) +# functional form with 'let' alias +test(2304.072, copy(DT)[, let(L1=NULL, L2=NULL)], ans) +# using set() +test(2304.073, set(copy(DT), j=c("L1", "L2"), value=NULL), ans) +test(2304.074, set(copy(DT), j=c("L1", "L2"), value=list(NULL, NULL)), ans) + +# Combining queries (add/remove/replace columns in the same query) for a single-row data.table + +# test for adding a new empty list column D and removing column L in the same query +DT = data.table(L=list("A"), i=1L) +ans = data.table(i=1L, D=list(NULL)) +test(2304.075, copy(DT)[, c("L", "D") := list(NULL, list(NULL))], ans) +test(2304.076, copy(DT)[, `:=`(L=NULL, D=list(NULL))], ans) +test(2304.077, copy(DT)[, let(L=NULL, D=list(NULL))], ans) +test(2304.078, set(copy(DT), j=c("L", "D"), value=list(NULL, list(NULL))), ans) + +# test for adding a new empty list column D and replacing column L with empty list in the same query +DT = data.table(L=list("A"), i=1L) +ans = data.table(L=list(NULL), i=1L, D=list(NULL)) +test(2304.079, copy(DT)[, c("L", "D") := list(list(NULL), list(NULL))], ans) +test(2304.080, copy(DT)[, `:=`(L=list(NULL), D=list(NULL))], ans) +test(2304.081, copy(DT)[, let(L=list(NULL), D=list(NULL))], ans) +test(2304.082, set(copy(DT), j=c("L", "D"), value=list(list(NULL), list(NULL))), ans) + +# test for replacing column L with an empty list and removing list column D in the same query +DT = data.table(L=list("A"), D=list("B"), i=1L) +ans = data.table(L=list(NULL), i=1L) +test(2304.083, copy(DT)[, c("L", "D") := list(list(NULL), NULL)], ans) +test(2304.084, copy(DT)[, `:=`(L=list(NULL), D=NULL)], ans) +test(2304.085, copy(DT)[, let(L=list(NULL), D=NULL)], ans) +test(2304.086, set(copy(DT), j=c("L", "D"), value=list(list(NULL), NULL)), ans) + +# test for combining add, replace, remove 
in the same query +DT = data.table(L=list("A"), D=list("B"), i=1L) +ans = data.table(L=list(NULL), i=1L, E=list(NULL)) +test(2304.087, copy(DT)[, c("L", "D", "E") := list(list(NULL), NULL, list(NULL))], ans) +test(2304.088, copy(DT)[, `:=`(L=list(NULL), D=NULL, E=list(NULL))], ans) +test(2304.089, copy(DT)[, let(L=list(NULL), D=NULL, E=list(NULL))], ans) +test(2304.090, set(copy(DT), j=c("L", "D", "E"), value=list(list(NULL), NULL, list(NULL))), ans) + +# sub-assignment of list column with list(NULL) in a multi-row data.table, using different assignment methods +DT = data.table(L=list("A", "B"), i=1L) +ans = data.table(L=list("A", NULL), i=1L) +# test using replacement with $ operator +DT$L[2L] = list(NULL) +test(2304.091, DT, ans) +DT = data.table(L=list("A", "B"), i=1L) +# standard form with := operator +test(2304.092, copy(DT)[2L, L := list(list(NULL))], ans) +# functional form with := operator +test(2304.093, copy(DT)[2L, `:=`(L=list(NULL))], ans) +# functional form with 'let' alias +test(2304.094, copy(DT)[2L, let(L=list(NULL))], ans) +# using set() +test(2304.095, set(copy(DT), i=2L, j="L", value=list(list(NULL))), ans) + +# sub-assignment of multiple list columns with list(NULL) in a multi-row data.table, using different assignment methods +DT = data.table(L1=list("A", "B"), L2=list("B", "C"), i=1L) +ans = data.table(L1=list("A", NULL), L2=list("B", NULL), i=1L) +DT$L1[2L] = list(NULL) +DT$L2[2L] = list(NULL) +test(2304.096, DT, ans) +DT = data.table(L1=list("A", "B"), L2=list("B", "C"), i=1L) +# standard form with := operator +test(2304.097, copy(DT)[2L, c("L1", "L2") := list(list(NULL), list(NULL))], ans) +# functional form with := operator +test(2304.098, copy(DT)[2L, `:=`(L1=list(NULL), L2=list(NULL))], ans) +# functional form with 'let' alias +test(2304.099, copy(DT)[2L, let(L1=list(NULL), L2=list(NULL))], ans) +# using set() +test(2304.100, set(copy(DT), i=2L, j=c("L1", "L2"), value=list(list(NULL), list(NULL))), ans) + +# the integer overflow in #6729 
is only noticeable with UBSan +test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE }) diff --git a/man/assign.Rd b/man/assign.Rd index 71e9542302..e1d4751f3c 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -78,6 +78,8 @@ When \code{LHS} is a factor column and \code{RHS} is a character vector with ite Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerced to match the type of the (often small) RHS. Instead the RHS is coerced to match the type of the LHS, if necessary. Where this involves double precision values being coerced to an integer column, a warning is given when fractional data is truncated. It is best to get the column types correct up front and stick to them. Changing a column type is possible but deliberately harder: provide a whole column as the RHS. This RHS is then \emph{plonked} into that column slot and we call this \emph{plonk syntax}, or \emph{replace column syntax} if you prefer. By needing to construct a full length vector of a new type, you as the user are more aware of what is happening and it is clearer to readers of your code that you really do intend to change the column type; e.g., \code{DT[, colA:=as.integer(colA)]}. A plonk occurs whenever you provide a RHS value to `:=` which is \code{nrow} long. When a column is \emph{plonked}, the original column is not updated by reference because that would entail updating every single element of that column whereas the plonk is just one column pointer update. \code{data.table}s are \emph{not} copied-on-change by \code{:=}, \code{setkey} or any of the other \code{set*} functions. See \code{\link{copy}}. + +While in most cases standard and functional form of \code{:=} are interchangeable, there are some minor differences in the way that \code{RHS} is handled. In the functional form, \code{:=} operator behaves like an alias to \code{list}. This means that when \code{RHS} is a list, \code{LHS} is assigned a list. 
Avoid this by using the standard form when \code{RHS} is a list, or use a vector. See \href{../doc/datatable-reference-semantics.html}{\code{vignette("datatable-reference-semantics")}} for examples. } \section{Advanced (internals):}{It is easy to see how \emph{sub-assigning} to existing columns is done internally. Removing columns by reference is also straightforward by modifying the vector of column pointers only (using memmove in C). However adding (new) columns is more tricky as to how the \code{data.table} can be grown \emph{by reference}: the list vector of column pointers is \emph{over-allocated}, see \code{\link{truelength}}. By defining \code{:=} in \code{j} we believe update syntax is natural, and scales, but it also bypasses \code{[<-} dispatch and allows \code{:=} to update by reference with no copies of any part of memory at all. diff --git a/man/deprecated.Rd b/man/deprecated.Rd deleted file mode 100644 index da138d8734..0000000000 --- a/man/deprecated.Rd +++ /dev/null @@ -1,16 +0,0 @@ -\name{key<-} -\alias{key<-} -\title{ Deprecated. } -\keyword{internal} -\description{ - This function is deprecated. It will be removed in future. Please use \code{\link{setkey}}. -} -\usage{ -key(x) <- value # warning since 2012; DEPRECATED since Mar 2019 -} -\examples{ -# dummy example section to pass release check that all .Rd files have examples -} -\arguments{ -\item{x}{ Deprecated. } -} diff --git a/man/fdroplevels.Rd b/man/fdroplevels.Rd index 724399d20a..4a7423a44c 100644 --- a/man/fdroplevels.Rd +++ b/man/fdroplevels.Rd @@ -12,13 +12,13 @@ fdroplevels(x, exclude = if (anyNA(levels(x))) NULL else NA, \dots) setdroplevels(x, except = NULL, exclude = NULL) -\method{droplevels}{data.table}(x, except = NULL, exclude, in.place = FALSE, \dots) +\method{droplevels}{data.table}(x, except = NULL, exclude, in.place = NULL, \dots) } \arguments{ \item{x}{ \code{factor} or \code{data.table} where unused levels should be dropped. 
} \item{exclude}{ A \code{character} vector of factor levels which are dropped no matter of presented or not. } \item{except}{ An \code{integer} vector of indices of data.table columns which are not modified by dropping levels. } - \item{in.place}{ logical (default is \code{FALSE}). If \code{TRUE} levels of factors of \code{data.table} are modified in-place. } + \item{in.place}{ Deprecated. Use \code{setdroplevels} for in-place modification. } \item{\dots}{ further arguments passed to methods } } diff --git a/man/fread.Rd b/man/fread.Rd index 00a24dab3a..27e840ccd9 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -22,10 +22,10 @@ key=NULL, index=NULL, showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), nThread=getDTthreads(verbose), -logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS +logical01=getOption("datatable.logical01", FALSE), logicalYN=getOption("datatable.logicalYN", FALSE), keepLeadingZeros = getOption("datatable.keepLeadingZeros", FALSE), -yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" +yaml=FALSE, autostart=NULL, tmpdir=tempdir(), tz="UTC" ) } \arguments{ @@ -65,7 +65,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{logicalYN}{If TRUE a column containing only Ys and Ns will be read as logical, otherwise as character.} \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.} \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } - \item{autostart}{ Deprecated and ignored with warning. 
Please use \code{skip} instead. } + \item{autostart}{ Deprecated. Please use \code{skip} instead. } \item{tmpdir}{ Directory to use as the \code{tmpdir} argument for any \code{tempfile} calls, e.g. when the input is a URL or a shell command. The default is \code{tempdir()} which can be controlled by setting \code{TMPDIR} before starting the R session; see \code{\link[base:tempfile]{base::tempdir}}. } \item{tz}{ Relevant to datetime values which have no Z or UTC-offset at the end, i.e. \emph{unmarked} datetime, as written by \code{\link[utils:write.table]{utils::write.csv}}. The default \code{tz="UTC"} reads unmarked datetime as UTC POSIXct efficiently. \code{tz=""} reads unmarked datetime as type character (slowly) so that \code{as.POSIXct} can interpret (slowly) the character datetimes in local timezone; e.g. by using \code{"POSIXct"} in \code{colClasses=}. Note that \code{fwrite()} by default writes datetime in UTC including the final Z and therefore \code{fwrite}'s output will be read by \code{fread} consistently and quickly without needing to use \code{tz=} or \code{colClasses=}. If the \code{TZ} environment variable is set to \code{"UTC"} (or \code{""} on non-Windows where unset vs `""` is significant) then the R session's timezone is already UTC and \code{tz=""} will result in unmarked datetimes being read as UTC POSIXct. For more information, please see the news items from v1.13.0 and v1.14.0. 
} } diff --git a/man/fwrite.Rd b/man/fwrite.Rd index efa830fb9d..efe6582200 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -12,12 +12,13 @@ fwrite(x, file = "", append = FALSE, quote = "auto", na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), logical01 = getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS - logicalAsInt = logical01, # deprecated + logicalAsInt = NULL, # deprecated scipen = getOption('scipen', 0L), dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), + compressLevel = 6L, yaml = FALSE, bom = FALSE, verbose = getOption("datatable.verbose", FALSE), @@ -58,6 +59,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} + \item{compressLevel}{Level of compression between 1 and 9, 6 by default. See \url{https://linux.die.net/man/1/gzip} for details.} \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. 
} \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} diff --git a/man/setDT.Rd b/man/setDT.Rd index 9311d0e3b1..5fc2789e29 100644 --- a/man/setDT.Rd +++ b/man/setDT.Rd @@ -18,21 +18,25 @@ setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) } \details{ - When working on large \code{lists} or \code{data.frames}, it might be both time and memory consuming to convert them to a \code{data.table} using \code{as.data.table(.)}, as this will make a complete copy of the input object before to convert it to a \code{data.table}. The \code{setDT} function takes care of this issue by allowing to convert \code{lists} - both named and unnamed lists and \code{data.frames} \emph{by reference} instead. That is, the input object is modified in place, no copy is being made. + When working on large \code{list}s or \code{data.frame}s, it might be both time- and memory-consuming to convert them to a \code{data.table} using \code{as.data.table(.)}, which will make a complete copy of the input object before converting it to a \code{data.table}. \code{setDT} takes care of this issue by converting any \code{list} (named or unnamed, data.frame or not) \emph{by reference} instead. That is, the input object is modified in place with no copy. + + This should come with low overhead, but note that \code{setDT} does check that the input is valid by looking for inconsistent input lengths and inadmissible column types (e.g. matrix). } \value{ - The input is modified by reference, and returned (invisibly) so it can be used in compound statements; e.g., \code{setDT(X)[, sum(B), by=A]}. If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). See \code{?copy}. + The input is modified by reference, and returned (invisibly) so it can be used in compound statements; e.g., \code{setDT(X)[, sum(B), by=A]}. 
If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). See \code{?copy}. } -\seealso{ \code{\link{data.table}}, \code{\link{as.data.table}}, \code{\link{setDF}}, \code{\link{copy}}, \code{\link{setkey}}, \code{\link{setcolorder}}, \code{\link{setattr}}, \code{\link{setnames}}, \code{\link{set}}, \code{\link{:=}}, \code{\link{setorder}} +\seealso{ + \code{\link{data.table}}, \code{\link{as.data.table}}, \code{\link{setDF}}, \code{\link{copy}}, \code{\link{setkey}}, \code{\link{setcolorder}}, \code{\link{setattr}}, \code{\link{setnames}}, \code{\link{set}}, \code{\link{:=}}, \code{\link{setorder}} } \examples{ set.seed(45L) -X = data.frame(A=sample(3, 10, TRUE), - B=sample(letters[1:3], 10, TRUE), - C=sample(10), stringsAsFactors=FALSE) +X = data.frame( + A=sample(3, 10, TRUE), + B=sample(letters[1:3], 10, TRUE), + C=sample(10)) # Convert X to data.table by reference and # get the frequency of each "A,B" combination diff --git a/man/setkey.Rd b/man/setkey.Rd index 378f1a106a..b8c9b1ce8f 100644 --- a/man/setkey.Rd +++ b/man/setkey.Rd @@ -142,6 +142,11 @@ setindex(DT, B) indices(DT) # get indices single vector indices(DT, vectors = TRUE) # get indices list +# Setting multiple indices at once +DT = data.table(A = 5:1, B = letters[5:1], C = 10:6) +setindexv(DT, list(c("A", "B"), c("B", "C"))) +print(DT, show.indices=TRUE) + # Use the dot .(subset_value) syntax with integer keys: DT = data.table(id = 2:1) setkey(DT, id) diff --git a/src/assign.c b/src/assign.c index b280c2259f..ae256f7774 100644 --- a/src/assign.c +++ b/src/assign.c @@ -33,7 +33,7 @@ void setselfref(SEXP x) { R_NilValue, // for identical() to return TRUE. identical() doesn't look at tag and prot PROTECT(getAttrib(x, R_NamesSymbol)), // to detect if names has been replaced and its tl lost, e.g. setattr(DT,"names",...) PROTECT(R_MakeExternalPtr( // to avoid an infinite loop in object.size(), if prot=x here - x, // to know if this data.table has been copied by key<-, attr<-, names<-, etc. 
+ x, // to know if this data.table has been copied by attr<-, names<-, etc. R_NilValue, // this tag and prot currently unused R_NilValue )) @@ -549,6 +549,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) for (int i=0; i 1; // showProgress only if more than 1 group + double startTime = (showProgress) ? wallclock() : 0; // For progress printing, startTime is set at the beginning + double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress defineVar(sym_BY, BY = PROTECT(allocVector(VECSXP, ngrpcols)), env); nprotect++; // PROTECT for rchk SEXP bynames = PROTECT(allocVector(STRSXP, ngrpcols)); nprotect++; // TO DO: do we really need bynames, can we assign names afterwards in one step? diff --git a/src/fread.c b/src/fread.c index a3346bba20..6d4e3e51f0 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1971,7 +1971,7 @@ int freadMain(freadMainArgs _args) { int thisLineLen = (int)(ch-lineStart); // ch is now on start of next line so this includes line ending already sampleLines++; sumLen += thisLineLen; - sumLenSq += thisLineLen*thisLineLen; + sumLenSq += (double)thisLineLen*thisLineLen; // avoid integer overflow, #6729 if (thisLineLenmaxLen) maxLen=thisLineLen; if (jump==0 && bumped) { diff --git a/src/fwrite.c b/src/fwrite.c index e6d149750e..4496f08130 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -26,12 +26,10 @@ #include "fwriteLookups.h" #include "fwrite.h" +#define MEGA (1 << 20) #define NUM_SF 15 #define SIZE_SF 1000000000000000ULL // 10^NUM_SF -#define MIN(a,b) (((a)<(b))?(a):(b)) -#define MAX(a,b) (((a)>(b))?(a):(b)) - // Globals for this file only. Written once to hold parameters passed from R level. 
static const char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files @@ -43,6 +41,7 @@ static bool qmethodEscape=false; // when quoting fields, how to escape dou static int scipen; static bool squashDateTime=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) static bool verbose=false; +static int gzip_level; extern const char *getString(const void *, int64_t); extern int getStringLen(const void *, int64_t); @@ -56,7 +55,8 @@ inline void write_chars(const char *x, char **pch) { // similar to C's strcpy but i) doesn't include trailing \0 and ii) moves destination along char *ch = *pch; - while (*x) *ch++=*x++; + while (*x) + *ch++ = *x++; *pch = ch; } @@ -113,7 +113,7 @@ void writeInt32(const void *col, int64_t row, char **pch) if (x == INT32_MIN) { write_chars(na, &ch); } else { - if (x<0) { *ch++ = '-'; x=-x; } + if (x<0) { *ch++ = '-'; x = -x; } // Avoid log() for speed. Write backwards then reverse when we know how long. char *low = ch; do { *ch++ = '0'+x%10; x/=10; } while (x>0); @@ -129,7 +129,7 @@ void writeInt64(const void *col, int64_t row, char **pch) if (x == INT64_MIN) { write_chars(na, &ch); } else { - if (x<0) { *ch++ = '-'; x=-x; } + if (x<0) { *ch++ = '-'; x = -x; } char *low = ch; do { *ch++ = '0'+x%10; x/=10; } while (x>0); reverse(ch, low); @@ -252,7 +252,7 @@ void writeFloat64(const void *col, int64_t row, char **pch) int dr = sf-exp-1; // how many characters to print to the right of the decimal place int width=0; // field width were it written decimal format. Used to decide whether to or not. 
int dl0=0; // how many 0's to add to the left of the decimal place before starting l - if (dr<=0) { dl0=-dr; dr=0; width=sf+dl0; } // 1, 10, 100, 99000 + if (dr<=0) { dl0 = -dr; dr=0; width=sf+dl0; } // 1, 10, 100, 99000 else { if (sf>dr) width=sf+1; // 1.234 and 123.4 else { dl0=1; width=dr+1+dl0; } // 0.1234, 0.0001234 @@ -285,7 +285,7 @@ void writeFloat64(const void *col, int64_t row, char **pch) *ch = '0' + l; ch += sf + (sf>1); *ch++ = 'e'; // lower case e to match base::write.csv - if (exp < 0) { *ch++ = '-'; exp=-exp; } + if (exp < 0) { *ch++ = '-'; exp = -exp; } else { *ch++ = '+'; } // to match base::write.csv if (exp < 100) { *ch++ = '0' + (exp / 10); @@ -562,14 +562,18 @@ void writeCategString(const void *col, int64_t row, char **pch) #ifndef NOZLIB int init_stream(z_stream *stream) { - memset(stream, 0, sizeof(z_stream)); // shouldn't be needed, done as part of #4099 to be sure stream->next_in = Z_NULL; stream->zalloc = Z_NULL; stream->zfree = Z_NULL; stream->opaque = Z_NULL; - - // 31 comes from : windows bits 15 | 16 gzip format - int err = deflateInit2(stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + // In deflateInit2, windowBits can be -8..-15 for raw deflate; -15 gives the largest window size. + // With -15, deflate() will generate raw deflate data with no zlib header or trailer, and will not compute a check value. + // Previously this parameter was 31 because windowBits can be greater than 15 for optional gzip encoding. + // Adding 16 writes a simple gzip header and trailer around the compressed data. + // Now we manage the header and trailer ourselves. The gzip file is slightly smaller with -15 because no header/trailer are + // written for each chunk. + // For memLevel, 8 is the default value (128 KiB). memLevel=9 uses maximum memory for optimal speed. To be tested ?
+ int err = deflateInit2(stream, gzip_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); return err; // # nocov } @@ -579,18 +583,17 @@ int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* sour stream->avail_out = *destLen; stream->next_in = (Bytef *)source; // don't use z_const anywhere; #3939 stream->avail_in = sourceLen; - int err = deflate(stream, Z_FINISH); - if (err == Z_OK) { - // with Z_FINISH, deflate must return Z_STREAM_END if correct, otherwise it's an error and we shouldn't return Z_OK (0) - err = -9; // # nocov - } - *destLen = stream->total_out; - return err == Z_STREAM_END ? Z_OK : err; + int err = deflate(stream, Z_SYNC_FLUSH); + *destLen = *destLen - stream->avail_out; + // *destLen = stream->total_out; + return (err != Z_STREAM_ERROR) ? Z_OK : err; } #endif /* - OpenMP is used here primarily to parallelize the process of writing rows + main fwrite function ---- + +OpenMP is used here primarily to parallelize the process of writing rows to the output file, but error handling and compression (if enabled) are also managed within the parallel region. Special attention is paid to thread safety and synchronization, especially in the ordered sections @@ -611,22 +614,31 @@ void fwriteMain(fwriteMainArgs args) doQuote = args.doQuote; int8_t quoteHeaders = args.doQuote; verbose = args.verbose; + gzip_level = args.gzip_level; + + size_t len; + unsigned int crc; + + // exit if compression is needed in param and no zlib +#ifdef NOZLIB + if (args.is_gzip) + STOP(_("Compression in fwrite uses zlib library. Its header files were not found at the time data.table was compiled. 
To enable fwrite compression, please reinstall data.table and study the output for further guidance.")); // # nocov +#endif // When NA is a non-empty string, then we must quote all string fields in case they contain the na string // na is recommended to be empty, though - if (na[0]!='\0' && doQuote==INT8_MIN) doQuote = true; + if (na[0]!='\0' && doQuote==INT8_MIN) + doQuote = true; qmethodEscape = args.qmethodEscape; squashDateTime = args.squashDateTime; - if (args.buffMB<1 || args.buffMB>1024) STOP(_("buffMB=%d outside [1,1024]"), args.buffMB); - size_t buffSize = (size_t)1024*1024*args.buffMB; - int eolLen=strlen(args.eol), naLen=strlen(args.na); // Aside: codacy wants strnlen but strnlen is not in C99 (neither is strlen_s). To pass `gcc -std=c99 -Wall -pedantic` // we'd need `#define _POSIX_C_SOURCE 200809L` before #include but that seems a step too far // and platform specific. We prefer to be pure C99. - if (eolLen<=0) STOP(_("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d"), eolLen); + if (eolLen <= 0) + STOP(_("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d"), eolLen); if (verbose) { DTPRINT(_("Column writers: ")); @@ -642,6 +654,8 @@ void fwriteMain(fwriteMainArgs args) // # notranslate end } + // Calc maxLineLen + // // Calculate upper bound for line length. Numbers use a fixed maximum (e.g. 12 for integer) while strings find the longest // string in each column. Upper bound is then the sum of the columns' max widths. // This upper bound is required to determine a reasonable rowsPerBatch. It also saves needing to grow the buffers which @@ -652,10 +666,10 @@ void fwriteMain(fwriteMainArgs args) // could be console output) and writing column names to it. double t0 = wallclock(); - size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + sepLen); + size_t maxLineLen = eolLen + args.ncol * (2*(doQuote!=0) + sepLen); if (args.doRowNames) { - maxLineLen += args.rowNames==NULL ? 
1+(int)log10(args.nrow) // the width of the row number - : (args.rowNameFun==WF_String ? getMaxStringLen(args.rowNames, args.nrow)*2 // *2 in case longest row name is all quotes (!) and all get escaped + maxLineLen += args.rowNames==NULL ? 1 + (int)log10(args.nrow) // the width of the row number + : (args.rowNameFun==WF_String ? getMaxStringLen(args.rowNames, args.nrow) * 2 // *2 in case longest row name is all quotes (!) and all get escaped : 11); // specific integer names could be MAX_INT 2147483647 (10 chars) even on a 5 row table, and data.frame allows negative integer rownames hence 11 for the sign maxLineLen += 2/*possible quotes*/ + sepLen; } @@ -673,20 +687,24 @@ void fwriteMain(fwriteMainArgs args) width = getMaxListItemLen(args.columns[j], args.nrow); break; default: // # nocov - INTERNAL_STOP("type %d has no max length method implemented", args.whichFun[j]); // # nocov + INTERNAL_STOP(_("type %d has no max length method implemented"), args.whichFun[j]); // # nocov } } - if (args.whichFun[j]==WF_Float64 && args.scipen>0) width+=MIN(args.scipen,350); // clamp width to IEEE754 max to avoid scipen=99999 allocating buffer larger than can ever be written - if (width 0) + // clamp width to IEEE754 max to avoid scipen=99999 allocating buffer larger than can ever be written + width += (args.scipen < 350) ? args.scipen : 350; + // ensure that NA string can be written + if (width < naLen) + width = naLen; + maxLineLen += width * 2; // *2 in case the longest string is all quotes and they all need to be escaped } - if (verbose) DTPRINT(_("maxLineLen=%"PRIu64". Found in %.3fs\n"), (uint64_t)maxLineLen, 1.0*(wallclock()-t0)); + if (verbose) + DTPRINT(_("maxLineLen=%"PRIu64". 
Found in %.3fs\n"), (uint64_t)maxLineLen, 1.0*(wallclock()-t0)); - int f=0; + int f = 0; if (*args.filename=='\0') { - f=-1; // file="" means write to standard output + f = -1; // file="" means write to standard output args.is_gzip = false; // gzip is only for file - // eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows } else { #ifdef WIN32 f = _open(args.filename, _O_WRONLY | _O_BINARY | _O_CREAT | (args.append ? _O_APPEND : _O_TRUNC), _S_IWRITE); @@ -706,42 +724,143 @@ void fwriteMain(fwriteMainArgs args) // # nocov end } } -#ifdef NOZLIB - if (args.is_gzip) - STOP(_("Compression in fwrite uses zlib library. Its header files were not found at the time data.table was compiled. To enable fwrite compression, please reinstall data.table and study the output for further guidance.")); // # nocov -#endif int yamlLen = strlen(args.yaml); if (verbose) { - DTPRINT(_("Writing bom (%s), yaml (%d characters) and column names (%s) ... "), - args.bom?"true":"false", yamlLen, args.colNames?"true":"false"); - if (f==-1) DTPRINT(_("\n")); + DTPRINT(_("Writing bom (%s), yaml (%d characters) and column names (%s)\n"), + args.bom ? "true" : "false", yamlLen, args.colNames ? 
"true" : "false"); + if (f == -1) + DTPRINT(_("\n")); } + + // Calc headerLen + size_t headerLen = 0; - if (args.bom) headerLen += 3; + if (args.bom) + headerLen += 3; headerLen += yamlLen; if (args.colNames) { - for (int j=0; j> column name) + for (int j=0; j> column name) + } + + // Create heap zones ---- + + int nth = args.nth; + + // Calc buffSize + size_t buffSize = args.buffMB * MEGA; + + // Decide buffer size and rowsPerBatch for each thread + // Once rowsPerBatch is decided it can't be changed + + // if maxLineLen is greater than buffSize, increase buffSize + if (buffSize < maxLineLen) { + buffSize = maxLineLen; + } + // ensure buffer can take header line + if (nth * buffSize < headerLen) { + buffSize = headerLen / nth + 1; + } + + int rowsPerBatch = buffSize / maxLineLen; + // + 1 because of the last incomplete loop + int numBatches = args.nrow / rowsPerBatch + 1; + + // force 1 batch, and then 1 thread, if number of lines in table < rowsPerBatch + if (args.nrow < rowsPerBatch) { + rowsPerBatch = args.nrow; + numBatches = 1; + } + + // avoid useless threads, and then too big memory allocations + if (numBatches < nth) + nth = numBatches; + + if (verbose) { + DTPRINT(_("Writing %"PRId64" rows in %d batches of %d rows, each buffer size %zu bytes (%zu MiB), showProgress=%d, nth=%d\n"), + args.nrow, numBatches, rowsPerBatch, buffSize, buffSize / MEGA, args.showProgress, nth); + } + + // alloc nth write buffers + errno = 0; + size_t alloc_size = nth * buffSize; + if (verbose) { + DTPRINT(_("Allocate %zu bytes (%zu MiB) for buffPool\n"), alloc_size, alloc_size / MEGA); + } + char *buffPool = malloc(alloc_size); + if (!buffPool) { + STOP(_("Unable to allocate %zu MB * %d thread buffers; '%d: %s'. 
Please read ?fwrite for nThread, buffMB and verbose options."), // # nocov + buffSize / MEGA, nth, errno, strerror(errno)); // # nocov } + + // init compress variables +#ifndef NOZLIB + z_stream strm; + // NB: fine to free() this even if unallocated + char *zbuffPool = NULL; + size_t zbuffSize = 0; + size_t compress_len = 0; + if (args.is_gzip) { + // compute zbuffSize which is the same for each thread + if (init_stream(&strm) != Z_OK) { + // # nocov start + free(buffPool); + STOP(_("Can't init stream structure for deflateBound")); + // # nocov end + } + zbuffSize = deflateBound(&strm, buffSize); + if (verbose) + DTPRINT(_("zbuffSize=%d returned from deflateBound\n"), (int)zbuffSize); + + // alloc nth zlib buffers + // if headerLen > nth * zbuffSize (long variable names and 1 thread), alloc headerLen + alloc_size = nth * zbuffSize < headerLen ? headerLen : nth * zbuffSize; + if (verbose) { + DTPRINT(_("Allocate %zu bytes (%zu MiB) for zbuffPool\n"), alloc_size, alloc_size / MEGA); + } + zbuffPool = malloc(alloc_size); + if (!zbuffPool) { + // # nocov start + free(buffPool); + deflateEnd(&strm); + STOP(_("Unable to allocate %zu MiB * %d thread compressed buffers; '%d: %s'. 
Please read ?fwrite for nThread, buffMB and verbose options."), + zbuffSize / MEGA, nth, errno, strerror(errno)); + // # nocov end + } + } + +#endif + + // write header + + // use buffPool and zbuffPool because we ensure that allocation is minimum headerLen + if (headerLen) { - char *buff = malloc(headerLen); - if (!buff) - STOP(_("Unable to allocate %zu MiB for header: %s"), headerLen / 1024 / 1024, strerror(errno)); // # nocov + char *buff = buffPool; char *ch = buff; - if (args.bom) {*ch++=(char)0xEF; *ch++=(char)0xBB; *ch++=(char)0xBF; } // 3 appears above (search for "bom") + if (args.bom) { + *ch++=(char)0xEF; + *ch++=(char)0xBB; + *ch++=(char)0xBF; + } // 3 appears above (search for "bom") memcpy(ch, args.yaml, yamlLen); ch += yamlLen; if (args.colNames) { if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column - if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv + if (doQuote !=0) { + // to match write.csv + *ch++='"'; + *ch++='"'; + } *ch = sep; ch += sepLen; } int8_t tempDoQuote = doQuote; doQuote = quoteHeaders; // temporary overwrite since headers might get different quoting behavior, #2964 - for (int j=0; j buffSize ? 
headerLen : buffSize); - char *zbuff = malloc(zbuffSize); - if (!zbuff) { - free(buff); // # nocov - STOP(_("Unable to allocate %zu MiB for zbuffer: %s"), zbuffSize / 1024 / 1024, strerror(errno)); // # nocov - } + if (args.is_gzip) { + char* zbuff = zbuffPool; + // write minimal gzip header + char* header = "\037\213\10\0\0\0\0\0\0\3"; + ret0 = WRITE(f, header, 10); + compress_len += 10; + crc = crc32(0L, Z_NULL, 0); + size_t zbuffUsed = zbuffSize; - ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (size_t)(ch-buff)); - if (ret1==Z_OK) ret2 = WRITE(f, zbuff, (int)zbuffUsed); - deflateEnd(&stream); - free(zbuff); -#endif + len = (size_t)(ch - buff); + crc = crc32(crc, (unsigned char*)buff, len); + ret1 = compressbuff(&strm, zbuff, &zbuffUsed, buff, len); + deflateEnd(&strm); + if (ret1==Z_OK) { + ret2 = WRITE(f, zbuff, (int)zbuffUsed); + compress_len += zbuffUsed; + } } else { +#endif ret2 = WRITE(f, buff, (int)(ch-buff)); +#ifndef NOZLIB } - free(buff); - if (ret1 || ret2==-1) { +#endif + if (ret0 == -1 || ret1 || ret2 == -1) { // # nocov start int errwrite = errno; // capture write errno now in case close fails with a different errno CLOSE(f); - if (ret1) STOP(_("Compress gzip error: %d"), ret1); - else STOP(_("%s: '%s'"), strerror(errwrite), args.filename); + free(buffPool); +#ifndef NOZLIB + free(zbuffPool); +#endif + if (ret0 == -1) STOP(_("Can't write gzip header error: %d"), ret0); + else if (ret1) STOP(_("Compress gzip error: %d"), ret1); + else STOP(_("%s: '%s'"), strerror(errwrite), args.filename); // # nocov end } } } - if (verbose) DTPRINT(_("done in %.3fs\n"), 1.0*(wallclock()-t0)); + if (verbose) + DTPRINT(_("Initialization done in %.3fs\n"), 1.0*(wallclock()-t0)); + + // empty file is test in fwrite.R if (args.nrow == 0) { - if (verbose) DTPRINT(_("No data rows present (nrow==0)\n")); - if (f!=-1 && CLOSE(f)) STOP(_("%s: '%s'"), strerror(errno), args.filename); + if (verbose) + DTPRINT(_("No data rows present (nrow==0)\n")); + free(buffPool); 
+#ifndef NOZLIB + free(zbuffPool); +#endif + if (f != -1 && CLOSE(f)) + STOP(_("%s: '%s'"), strerror(errno), args.filename); // # nocov return; } - // Writing rows + // Write rows ---- - // Decide buffer size and rowsPerBatch for each thread - // Once rowsPerBatch is decided it can't be changed - int rowsPerBatch=0; - if (maxLineLen*2>buffSize) { buffSize=2*maxLineLen; rowsPerBatch=2; } - else rowsPerBatch = buffSize / maxLineLen; - if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; - if (rowsPerBatch < 1) rowsPerBatch = 1; - int numBatches = (args.nrow-1)/rowsPerBatch + 1; - int nth = args.nth; - if (numBatches < nth) nth = numBatches; - if (verbose) { - DTPRINT(_("Writing %"PRId64" rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d)\n"), - args.nrow, numBatches, rowsPerBatch, args.buffMB, args.showProgress, nth); - } t0 = wallclock(); bool hasPrinted = false; int maxBuffUsedPC = 0; - // compute zbuffSize which is the same for each thread - size_t zbuffSize = 0; - if(args.is_gzip){ -#ifndef NOZLIB - z_stream stream = {0}; - if(init_stream(&stream)) - STOP(_("Can't allocate gzip stream structure")); // # nocov - zbuffSize = deflateBound(&stream, buffSize); - if (verbose) DTPRINT(_("zbuffSize=%d returned from deflateBound\n"), (int)zbuffSize); - deflateEnd(&stream); -#endif - } - - errno=0; - char *buffPool = malloc(nth*(size_t)buffSize); - if (!buffPool) { - // # nocov start - STOP(_("Unable to allocate %zu MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), - (size_t)buffSize/(1024^2), nth, errno, strerror(errno)); - // # nocov end - } - char *zbuffPool = NULL; - if (args.is_gzip) { -#ifndef NOZLIB - zbuffPool = malloc(nth*(size_t)zbuffSize); - if (!zbuffPool) { - // # nocov start - free(buffPool); - STOP(_("Unable to allocate %zu MB * %d thread compressed buffers; '%d: %s'. 
Please read ?fwrite for nThread, buffMB and verbose options."), - (size_t)zbuffSize/(1024^2), nth, errno, strerror(errno)); - // # nocov end - } -#endif - } - bool failed = false; // naked (unprotected by atomic) write to bool ok because only ever write true in this special paradigm int failed_compress = 0; // the first thread to fail writes their reason here when they first get to ordered section int failed_write = 0; // same. could use +ve and -ve in the same code but separate it out to trace Solaris problem, #3931 -#ifndef NOZLIB - z_stream *thread_streams = (z_stream *)malloc(nth * sizeof(z_stream)); - if (!thread_streams) - STOP(_("Failed to allocated %d bytes for '%s'."), (int)(nth * sizeof(z_stream)), "thread_streams"); // # nocov - // VLA on stack should be fine for nth structs; in zlib v1.2.11 sizeof(struct)==112 on 64bit - // not declared inside the parallel region because solaris appears to move the struct in - // memory when the #pragma omp for is entered, which causes zlib's internal self reference - // pointer to mismatch, #4099 - char failed_msg[1001] = ""; // to hold zlib's msg; copied out of zlib in ordered section just in case the msg is allocated within zlib -#endif - - #pragma omp parallel num_threads(nth) - { +// main parallel loop ---- +#pragma omp parallel for ordered num_threads(nth) schedule(dynamic) + for(int64_t start=0; start < args.nrow; start += rowsPerBatch) { int me = omp_get_thread_num(); int my_failed_compress = 0; - char *ch, *myBuff; - ch = myBuff = buffPool + me*buffSize; + char* myBuff = buffPool + me * buffSize; + char* ch = myBuff; +#ifndef NOZLIB + z_stream mystream; + size_t mylen = 0; + int mycrc = 0; void *myzBuff = NULL; size_t myzbuffUsed = 0; -#ifndef NOZLIB - z_stream *mystream = &thread_streams[me]; if (args.is_gzip) { - myzBuff = zbuffPool + me*zbuffSize; - if (init_stream(mystream)) { // this should be thread safe according to zlib documentation + myzBuff = zbuffPool + me * zbuffSize; + if (init_stream(&mystream) 
!= Z_OK) { // this should be thread safe according to zlib documentation failed = true; // # nocov my_failed_compress = -998; // # nocov } } #endif + if (failed) + continue; // Not break. Because we don't use #omp cancel yet. + int64_t end = ((args.nrow - start) < rowsPerBatch) ? args.nrow : start + rowsPerBatch; - #pragma omp for ordered schedule(dynamic) - for(int64_t start=0; start=1 because 0-columns was caught earlier. - write_chars(args.eol, &ch); // overwrite last sep with eol instead + *ch = sep; + ch += sepLen; } - // compress buffer if gzip + // Hot loop + for (int j=0; j=1 because 0-columns was caught earlier. + write_chars(args.eol, &ch); // overwrite last sep with eol instead + } // end of chunk rows loop + + // compress buffer if gzip #ifndef NOZLIB - if (args.is_gzip && !failed) { - myzbuffUsed = zbuffSize; - int ret = compressbuff(mystream, myzBuff, &myzbuffUsed, myBuff, (size_t)(ch-myBuff)); - if (ret) { failed=true; my_failed_compress=ret; } - else deflateReset(mystream); + if (args.is_gzip && !failed) { + myzbuffUsed = zbuffSize; + mylen = (size_t)(ch - myBuff); + mycrc = crc32(0, (unsigned char*)myBuff, mylen); + int ret = compressbuff(&mystream, myzBuff, &myzbuffUsed, myBuff, mylen); + deflateEnd(&mystream); + if (ret) { + failed=true; + my_failed_compress=ret; + } + } +#endif + + // ordered region ---- +#pragma omp ordered + if (failed) { + if (failed_compress==0 && my_failed_compress!=0) { + failed_compress = my_failed_compress; // # nocov } + // else another thread could have failed below while I was working or waiting above; their reason got here first + } else { + errno=0; + int ret = 0; + if (f == -1) { + *ch='\0'; // standard C string end marker so DTPRINT knows where to stop + DTPRINT("%s", myBuff); + } else +#ifndef NOZLIB + if (args.is_gzip) { + ret = WRITE(f, myzBuff, (int)myzbuffUsed); + compress_len += myzbuffUsed; + } else #endif - #pragma omp ordered { - if (failed) { - // # nocov start - if (failed_compress==0 && 
my_failed_compress!=0) { - failed_compress = my_failed_compress; + ret = WRITE(f, myBuff, (int)(ch-myBuff)); + } + if (ret == -1) { + failed=true; // # nocov + failed_write=errno; // # nocov + } + #ifndef NOZLIB - if (mystream->msg!=NULL) strncpy(failed_msg, mystream->msg, 1000); // copy zlib's msg for safe use after deflateEnd just in case zlib allocated the message + if (args.is_gzip) { + crc = crc32_combine(crc, mycrc, mylen); + len += mylen; + } #endif - } - // else another thread could have failed below while I was working or waiting above; their reason got here first + + int used = 100 * ((double)(ch - myBuff)) / buffSize; // percentage of original buffMB + if (used > maxBuffUsedPC) + maxBuffUsedPC = used; + double now; + if (me == 0 && !failed && args.showProgress && (now=wallclock()) >= nextTime) { + // See comments above inside the f==-1 clause. + // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the + // master thread (me==0) and hopefully this will work on Windows. If not, user should set + // showProgress=FALSE until this can be fixed or removed. + int ETA = (int)((args.nrow - end) * (now-startTime) /end); + if (hasPrinted || ETA >= 2) { + // # nocov start + if (verbose && !hasPrinted) DTPRINT("\n"); + DTPRINT(Pl_(nth, + "\rWritten %.1f%% of %"PRId64" rows in %d secs using %d thread. maxBuffUsed=%d%%. ETA %d secs. ", + "\rWritten %.1f%% of %"PRId64" rows in %d secs using %d threads. maxBuffUsed=%d%%. ETA %d secs. "), + (100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, maxBuffUsedPC, ETA); // # nocov + // TODO: use progress() as in fread + nextTime = now + 1; + hasPrinted = true; // # nocov end - } else { - errno=0; - if (f==-1) { - *ch='\0'; // standard C string end marker so DTPRINT knows where to stop - DTPRINT("%s", myBuff); - } else if ((args.is_gzip ? 
WRITE(f, myzBuff, (int)myzbuffUsed) - : WRITE(f, myBuff, (int)(ch-myBuff))) == -1) { - failed=true; // # nocov - failed_write=errno; // # nocov - } - - int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB - if (used > maxBuffUsedPC) maxBuffUsedPC = used; - double now; - if (me==0 && args.showProgress && (now=wallclock())>=nextTime && !failed) { - // See comments above inside the f==-1 clause. - // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the - // master thread (me==0) and hopefully this will work on Windows. If not, user should set - // showProgress=FALSE until this can be fixed or removed. - // # nocov start - int ETA = (int)((args.nrow-end)*((now-startTime)/end)); - if (hasPrinted || ETA >= 2) { - if (verbose && !hasPrinted) DTPRINT("\n"); - DTPRINT(Pl_(nth, - "\rWritten %.1f%% of %"PRId64" rows in %d secs using %d thread. maxBuffUsed=%d%%. ETA %d secs. ", - "\rWritten %.1f%% of %"PRId64" rows in %d secs using %d threads. maxBuffUsed=%d%%. ETA %d secs. "), - (100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, maxBuffUsedPC, ETA); - // TODO: use progress() as in fread - nextTime = now+1; - hasPrinted = true; - } - // # nocov end - } - // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. - // Something like: - // if (me==0) { - // failed = TRUE; // inside ordered here; the slaves are before ordered and not looking at 'failed' - // R_CheckUserInterrupt(); - // failed = FALSE; // no user interrupt so return state - // } - // But I fear the slaves will hang waiting for the master (me==0) to complete the ordered - // section which may not happen if the master thread has been interrupted. Rather than - // seeing failed=TRUE and falling through to free() and close() as intended. 
- // Could register a finalizer to free() and close() perhaps : - // [r-devel] http://r.789695.n4.nabble.com/checking-user-interrupts-in-C-code-tp2717528p2717722.html - // Conclusion for now: do not provide ability to interrupt. - // write() errors and malloc() fails will be caught and cleaned up properly, however. - ch = myBuff; // back to the start of my buffer ready to fill it up again } } } - // all threads will call this free on their buffer, even if one or more threads had malloc - // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. - if (args.is_gzip) { -#ifndef NOZLIB - deflateEnd(mystream); -#endif - } - } + + } // end of parallel for loop + free(buffPool); + #ifndef NOZLIB - free(thread_streams); free(zbuffPool); + +/* put a 4-byte integer into a byte array in LSB order */ +#define PUT4(a,b) ((a)[0]=(b), (a)[1]=(b)>>8, (a)[2]=(b)>>16, (a)[3]=(b)>>24) + + // write gzip tailer with crc and len + if (args.is_gzip) { + unsigned char tail[10]; + tail[0] = 3; + tail[1] = 0; + PUT4(tail + 2, crc); + PUT4(tail + 6, len); + int ret = WRITE(f, tail, 10); + compress_len += 10; + if (ret == -1) + STOP(_("Failed to write gzip trailer")); // # nocov + } #endif // Finished parallel region and can call R API safely now. @@ -1010,14 +1102,25 @@ void fwriteMain(fwriteMainArgs args) // # nocov start if (!failed) { // clear the progress meter DTPRINT("\r " - " \r"); + " \r\n"); } else { // don't clear any potentially helpful output before error DTPRINT("\n"); } // # nocov end } - if (f!=-1 && CLOSE(f) && !failed) + if (verbose) { +#ifndef NOZLIB + if (args.is_gzip) { + DTPRINT(_("zlib: uncompressed length=%zu (%zu MiB), compressed length=%zu (%zu MiB), ratio=%.1f%%, crc=%x\n"), + len, len / MEGA, compress_len, compress_len / MEGA, len != 0 ? (100.0 * compress_len) / len : 0, crc); + } +#endif + DTPRINT(_("Written %"PRId64" rows in %.3f secs using %d thread%s. MaxBuffUsed=%d%%\n"), + args.nrow, 1.0*(wallclock()-t0), nth, nth ==1 ? 
"" : "s", maxBuffUsedPC); + } + + if (f != -1 && CLOSE(f) && !failed) STOP("%s: '%s'", strerror(errno), args.filename); // # nocov // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the @@ -1027,8 +1130,8 @@ void fwriteMain(fwriteMainArgs args) // # nocov start #ifndef NOZLIB if (failed_compress) - STOP(_("zlib %s (zlib.h %s) deflate() returned error %d with z_stream->msg==\"%s\" Z_FINISH=%d Z_BLOCK=%d. %s"), - zlibVersion(), ZLIB_VERSION, failed_compress, failed_msg, Z_FINISH, Z_BLOCK, + STOP(_("zlib %s (zlib.h %s) deflate() returned error %d Z_FINISH=%d Z_BLOCK=%d. %s"), + zlibVersion(), ZLIB_VERSION, failed_compress, Z_FINISH, Z_BLOCK, verbose ? _("Please include the full output above and below this message in your data.table bug report.") : _("Please retry fwrite() with verbose=TRUE and include the full output with your data.table bug report.")); #endif @@ -1037,5 +1140,3 @@ void fwriteMain(fwriteMainArgs args) // # nocov end } } - - diff --git a/src/fwrite.h b/src/fwrite.h index 867081a5fb..b133ec0944 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -113,6 +113,7 @@ typedef struct fwriteMainArgs int nth; bool showProgress; bool is_gzip; + int gzip_level; bool bom; const char *yaml; bool verbose; diff --git a/src/fwriteR.c b/src/fwriteR.c index 0d5c0969ef..b4353cd4f1 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -167,6 +167,7 @@ SEXP fwriteR( SEXP nThread_Arg, SEXP showProgress_Arg, SEXP is_gzip_Arg, + SEXP gzip_level_Arg, SEXP bom_Arg, SEXP yaml_Arg, SEXP verbose_Arg, @@ -177,6 +178,7 @@ SEXP fwriteR( fwriteMainArgs args = {0}; // {0} to quieten valgrind's uninitialized, #4639 args.is_gzip = LOGICAL(is_gzip_Arg)[0]; + args.gzip_level = INTEGER(gzip_level_Arg)[0]; args.bom = LOGICAL(bom_Arg)[0]; args.yaml = CHAR(STRING_ELT(yaml_Arg, 0)); args.verbose = LOGICAL(verbose_Arg)[0]; diff --git a/src/init.c b/src/init.c index 204dc1088d..0188cfaf65 100644 --- 
a/src/init.c +++ b/src/init.c @@ -72,6 +72,7 @@ R_CallMethodDef callMethods[] = { {"Crbindlist", (DL_FUNC) &rbindlist, -1}, {"Cvecseq", (DL_FUNC) &vecseq, -1}, {"Csetlistelt", (DL_FUNC) &setlistelt, -1}, +{"CsetS4elt", (DL_FUNC) &setS4elt, -1}, {"Caddress", (DL_FUNC) &address, -1}, {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1}, {"Cfmelt", (DL_FUNC) &fmelt, -1}, diff --git a/src/wrappers.c b/src/wrappers.c index 6587caa97a..2cf46d9cfe 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -59,18 +59,26 @@ SEXP copy(SEXP x) return(duplicate(x)); } +// Internal use only. So that := can update elements of a list of data.table, #2204. Just needed to overallocate/grow the VECSXP. SEXP setlistelt(SEXP l, SEXP i, SEXP value) { - R_len_t i2; - // Internal use only. So that := can update elements of a list of data.table, #2204. Just needed to overallocate/grow the VECSXP. - if (!isNewList(l)) error(_("First argument to setlistelt must be a list()")); - if (!isInteger(i) || LENGTH(i)!=1) error(_("Second argument to setlistelt must a length 1 integer vector")); - i2 = INTEGER(i)[0]; + if (!isNewList(l)) internal_error(__func__, "First argument to setlistelt must be a list()"); + if (!isInteger(i) || LENGTH(i)!=1) internal_error(__func__, "Second argument to setlistelt must a length 1 integer vector"); + R_len_t i2 = INTEGER(i)[0]; if (LENGTH(l) < i2 || i2<1) error(_("i (%d) is outside the range of items [1,%d]"),i2,LENGTH(l)); SET_VECTOR_ELT(l, i2-1, value); return(R_NilValue); } +// Internal use only. So that := can update elements of a slot of data.table, #6701. 
+SEXP setS4elt(SEXP obj, SEXP name, SEXP value) +{ + if (!isS4(obj)) internal_error(__func__, "First argument to setS4elt must be an S4 object"); + if (!isString(name) || LENGTH(name)!=1) internal_error(__func__, "Second argument to setS4elt must be a character string"); + R_do_slot_assign(obj, name, value); + return(R_NilValue); +} + SEXP address(SEXP x) { // A better way than : http://stackoverflow.com/a/10913296/403310 diff --git a/vignettes/datatable-joins.Rmd b/vignettes/datatable-joins.Rmd index a35f78bb21..cb880deafe 100644 --- a/vignettes/datatable-joins.Rmd +++ b/vignettes/datatable-joins.Rmd @@ -126,12 +126,12 @@ The next diagram shows a description for each basic argument. In the following s x[i, on, nomatch] | | | | | | | \__ If NULL only returns rows linked in x and i tables -| | \____ a character vector o list defining match logict +| | \____ a character vector or list defining match logic | \_____ primary data.table, list or data.frame \____ secondary data.table ``` -> Please keep in mind that the standard argument order in data.table is `dt[i, j, by]`. For join operations, it is recommended to pass the `on` and `nomatch` arguments by name to avoid using `j` and `by` when they are not needed. +> Please keep in mind that the standard argument order in `data.table` is `dt[i, j, by]`. For join operations, it is recommended to pass the `on` and `nomatch` arguments by name to avoid using `j` and `by` when they are not needed. ## 3. Equi joins @@ -160,8 +160,8 @@ Products[ProductReceived, As many things have changed, let's explain the new characteristics in the following groups: - **Column level** - - The *first group* of columns in the new data.table comes from the `x` table. - - The *second group* of columns in the new data.table comes from the `i` table. + - The *first group* of columns in the new `data.table` comes from the `x` table. + - The *second group* of columns in the new `data.table` comes from the `i` table. 
- If the join operation presents a present any **name conflict** (both table have same column name) the ***prefix*** `i.` is added to column names from the **right-hand table** (table on `i` position). - **Row level** @@ -183,7 +183,7 @@ Products[ProductReceived, on = list(id = product_id)] ``` -- Wrapping the related columns in the data.table `list` alias `.`. +- Wrapping the related columns in the `data.table` `list` alias `.`. ```{r, eval=FALSE} Products[ProductReceived, @@ -249,7 +249,7 @@ Products[ ``` -##### Summarizing with on in data.table +##### Summarizing with `on` in `data.table` We can also use this alternative to return aggregated results based columns present in the `x` table. @@ -302,18 +302,18 @@ ProductReceived[Products, nomatch = NULL] ``` -Despite both tables have the same information, they present some relevant differences: +Despite both tables having the same information, there are some relevant differences: -- They present different order for their columns -- They have some name differences on their columns names: - - The `id` column of first table has the same information as the `product_id` in the second table. - - The `i.id` column of first table has the same information as the `id` in the second table. +- They present different column ordering. +- They have column name differences: + - The `id` column in the first table has the same information as the `product_id` in the second table. + - The `i.id` column in the first table has the same information as the `id` in the second table. ### 3.3. Not join This method **keeps only the rows that don't match with any row of a second table**. -To apply this technique we just need to negate (`!`) the table located on the `i` argument. +To apply this technique we can negate (`!`) the table located on the `i` argument. ```{r} Products[!ProductReceived, @@ -331,7 +331,7 @@ In this case, the operation returns the row with `product_id = 6,` as it is not ### 3.4. 
Semi join

-This method extract **keeps only the rows that match with any row in a second table** without combining the column of the tables.
+This method extracts **only the rows that match any row in a second table**, without combining the columns of the tables.

It's very similar to subset as join, but as in this time we are passing a complete table to the `i` we need to ensure that: @@ -391,7 +391,7 @@ Here some important considerations:

- **Row level**
  - All rows from in the `i` table were kept as we never received any banana but row is still part of the results.
- - The row related to `product_id = 6` is no part of the results any more as it is not present in the `Products` table.
+ - The row related to `product_id = 6` is not part of the results any more as it is not present in the `Products` table.

#### 3.5.1. Joining after chain operations

@@ -510,7 +510,7 @@

Use this method if you need to combine columns from 2 tables based on one or more id columns.

As we saw in the previous section, any of the prior operations can keep the missing `product_id = 6` and the **soda** (`product_id = 4`) as part of the results.

-To save this problem, we can use the `merge` function even thought it is lower than using the native `data.table`'s joining syntax.
+To solve this problem, we can use the `merge` function even though it is slower than using the native `data.table`'s joining syntax.

```{r}
merge(x = Products,
@@ -524,24 +524,24 @@ merge(x = Products,

## 4. Non-equi join

-A non-equi join is a type of join where the condition for matching rows is not based on equality, but on other comparison operators like <, >, <=, or >=. This allows for **more flexible joining criteria**. In `data.table`, non-equi joins are particularly useful for operations like: +A non-equi join is a type of join where the condition for matching rows is based on comparison operators other than equality, such as `<`, `>`, `<=`, or `>=`. This allows for **more flexible joining criteria**. 
In `data.table`, non-equi joins are particularly useful for operations like:

-- Finding the nearest match
-- Comparing ranges of values between tables
+- Finding the nearest match.
+- Comparing ranges of values between tables.

-It's a great alternative if after applying a right of inner join:
+It is a great alternative when, after applying a right or inner join, you:

-- You want to decrease the number of returned rows based on comparing numeric columns of different table.
-- You don't need to keep the columns from table `x`*(secondary data.table)* in the final table.
+- Want to reduce the number of returned rows based on comparisons of numeric columns between tables.
+- Do not need to retain the columns from table x *(the secondary `data.table`)* in the final result.

-To illustrate how this work, let's center over attention on how are the sales and receives for product 2.
+To illustrate how this works, let's focus on the sales and receives for product 2.

```{r}
ProductSalesProd2 = ProductSales[product_id == 2L]
ProductReceivedProd2 = ProductReceived[product_id == 2L]
```

-If want to know, for example, if can find any receive that took place before a sales date, we can apply the next code.
+If we want to know, for example, whether we can find any receive that took place before a sales date, we can apply the following.

```{r}
ProductReceivedProd2[ProductSalesProd2,
@@ -552,16 +552,16 @@ ProductReceivedProd2[ProductSalesProd2,

What does happen if we just apply the same logic on the list passed to `on`?

-- As this opperation it's still a right join, it returns all rows from the `i` table, but only shows the values for `id` and `count` when the rules are met.
+- As this operation is still a right join, it returns all rows from the `i` table, but only shows the values for `id` and `count` when the rules are met.

-The date related `ProductReceivedProd2` was omited from this new table.
+The date related `ProductReceivedProd2` was omitted from this new table. 
```{r} ProductReceivedProd2[ProductSalesProd2, on = list(product_id, date < date)] ``` -Now, after applying the join, we can limit the results only show the cases that meet all joining criteria. +Now, after applying the join, we can limit the results only showing the cases that meet all joining criteria. ```{r} ProductReceivedProd2[ProductSalesProd2, @@ -574,7 +574,7 @@ ProductReceivedProd2[ProductSalesProd2, Rolling joins are particularly useful in time-series data analysis. They allow you to **match rows based on the nearest value** in a sorted column, typically a date or time column. -This is valuable when you need to align data from different sources **that may not have exactly matching timestamps**, or when you want to carry forward the most recent value. +This is valuable when you need to align data from different sources **that may not have exact matching timestamps**, or when you want to carry forward the most recent value. For example, in financial data, you might use a rolling join to assign the most recent stock price to each transaction, even if the price updates and transactions don't occur at the exact same times. @@ -594,7 +594,7 @@ ProductPriceHistory = data.table( ProductPriceHistory ``` -Now, we can perform a right join giving a different prices for each product based on the sale date. +Now, we can perform a right join giving a different price for each product based on the sale date. ```{r} ProductPriceHistory[ProductSales, @@ -613,13 +613,13 @@ ProductPriceHistory[ProductSales, j = .(product_id, date, count, price)] ``` -## 7. Taking advange of joining speed +## 7. Taking advantage of joining speed ### 7.1. Subsets as joins -As we just saw in the prior section the `x` table gets filtered by the values available in the `i` table. Actually, that process is faster than passing a Boolean expression to the `i` argument. +As we just saw in the prior section the `x` table gets filtered by the values available in the `i` table. 
This process is faster than passing a Boolean expression to the `i` argument. -To filter the `x` table at speed we don't to pass a complete `data.table`, we can pass a `list()` of vectors with the values that we want to keep or omit from the original table. +To filter the `x` table at speed we don't need to pass a complete `data.table`, we can pass a `list()` of vectors with the values that we want to keep or omit from the original table. For example, to filter dates where the market received 100 units of bananas (`product_id = 1`) or popcorn (`product_id = 3`) we can use the following: @@ -628,7 +628,7 @@ ProductReceived[list(c(1L, 3L), 100L), on = c("product_id", "count")] ``` -As at the end, we are filtering based on a join operation the code returned a **row that was not present in original table**. To avoid that behavior, it is recommended to always to add the argument `nomatch = NULL`. +As at the end, we are filtering based on a join operation the code returned a **row that was not present in original table**. To avoid that behavior, it is recommended to always add the argument `nomatch = NULL`. ```{r} ProductReceived[list(c(1L, 3L), 100L), @@ -644,7 +644,7 @@ ProductReceived[!list(c(1L, 3L), 100L), on = c("product_id", "count")] ``` -If you just want to filter a value for a single **character column**, you can omit calling the `list()` function pass the value to been filtered in the `i` argument. +If you just want to filter a value for a single **character column**, you can omit calling the `list()` function and pass the value to be filtered in the `i` argument. ```{r} Products[c("banana","popcorn"), @@ -660,7 +660,7 @@ Products[!"popcorn", ### 7.2. Updating by reference -The `:=` operator in data.table is used for updating or adding columns by reference. This means it modifies the original data.table without creating a copy, which is very memory-efficient, especially for large datasets. 
When used inside a data.table, `:=` allows you to **add new columns** or **modify existing ones** as part of your query. +The `:=` operator in `data.table` is used for updating or adding columns by reference. This means it modifies the original `data.table` without creating a copy, which is very memory-efficient, especially for large datasets. When used inside a `data.table`, `:=` allows you to **add new columns** or **modify existing ones** as part of your query. Let's update our `Products` table with the latest price from `ProductPriceHistory`: @@ -674,7 +674,7 @@ copy(Products)[ProductPriceHistory, In this operation: -- The function `copy` prevent that `:=` changes by reference the `Products` table.s +- The function copy creates a ***deep*** copy of the `Products` table, preventing modifications made by `:=` from changing the original table by reference. - We join `Products` with `ProductPriceHistory` based on `id` and `product_id`. - We update the `price` column with the latest price from `ProductPriceHistory`. - We add a new `last_updated` column to track when the price was last changed. diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index b6d895af14..1c7e812e72 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -118,12 +118,33 @@ Note that the code above explains how `:=` can be used. They are not working exa * On the other hand, (b) is handy if you would like to jot some comments down for later. +* In general, (a) and (b) should produce the same result, however there are minor differences in certain cases due to the implementation details of `:=`. Specifically, when using (b), the `:=` operator behaves like an alias to `list`, therefore if `RHS` is a `list` in functional form the column will become a list. See example below. + * The result is returned *invisibly*. 
* Since `:=` is available in `j`, we can combine it with `i` and `by` operations just like the aggregation operations we saw in the previous vignette. # +Although in most cases they are the same, there is a minor difference between the standard and functional forms mentioned above. Let's see an example to understand this. +```{r} +DT = data.table(a = list('A', 'B', 'C')) +l = list(1L:3L) + +DT[, new_int := l] # Standard form, new column is integer + +DT[, `:=`(new_list = l)] # Functional form, new column is a list of integer vectors + +DT[, new_list := list(l)] # Same as above, we can see that `:=` is an alias to list in functional form + +# This can be avoided by using a vector instead of a list: +v = 1L:3L + +DT[, new_int := v] # Standard form, new column is integer + +DT[, `:=`(new_int = v)] # Functional form, new column is integer +``` + In the two forms of `:=` shown above, note that we don't assign the result back to a variable. Because we don't need to. The input *data.table* is modified by reference. Let's go through examples to understand what we mean by this. For the rest of the vignette, we will work with `flights` *data.table*.