From d794b3b754a1b003d296e643686307942db03714 Mon Sep 17 00:00:00 2001 From: Otto Seiskari Date: Sat, 26 Mar 2016 20:58:52 +0200 Subject: [PATCH 1/6] First version of the fwrite function #580 --- NAMESPACE | 1 + R/fwrite.R | 62 +++++++++++++++++++++++ README.md | 2 + inst/tests/tests.Rraw | 112 ++++++++++++++++++++++++++++++++++++++++++ man/fwrite.Rd | 41 ++++++++++++++++ src/fwrite.c | 59 ++++++++++++++++++++++ src/init.c | 2 + 7 files changed, 279 insertions(+) create mode 100644 R/fwrite.R create mode 100644 man/fwrite.Rd create mode 100644 src/fwrite.c diff --git a/NAMESPACE b/NAMESPACE index 45dd8f8269..3e6b1bd85d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(setNumericRounding, getNumericRounding) export(chmatch, "%chin%", chorder, chgroup) export(rbindlist) export(fread) +export(fwrite) export(foverlaps) export(shift) export(transpose) diff --git a/R/fwrite.R b/R/fwrite.R new file mode 100644 index 0000000000..7d5e5074e7 --- /dev/null +++ b/R/fwrite.R @@ -0,0 +1,62 @@ +fwrite <- function(dt, file.path, append = FALSE, quote = TRUE, + sep = ",", eol = "\n", col.names = TRUE, qmethod = "double", + block.size = 10000) { + + # validate arguments + stopifnot(is.data.frame(dt)) + stopifnot(ncol(dt) > 0) + stopifnot(identical(unique(names(dt)), names(dt))) + + stopifnot(length(quote) == 1 && class(quote) == "logical") + stopifnot(length(sep) == 1 && class(sep) == "character" && nchar(sep) == 1) + stopifnot(length(eol) == 1 && class(eol) == "character") + stopifnot(length(qmethod) == 1 && qmethod %in% c("double", "escape")) + stopifnot(length(col.names) == 1 && class(col.names) == "logical") + stopifnot(length(append) == 1 && class(append) == "logical") + stopifnot(length(block.size) == 1 && block.size > 0) + + quoted_cols <- rep(quote, ncol(dt)) + + # special case: single-column data.frame, doing dt[block_begin:block_end,] + # for such data frame gives a vector + if (!is.data.table(dt) && ncol(dt) == 1) dt <- as.data.table(dt) + + # write header row separately for correct quoting of row names + if (col.names && !append) { + .Call(Cwritefile, as.list(names(dt)), file.path, sep, eol, quoted_cols, qmethod == "escape", append) + append <- TRUE + } + + # handle empty dt + if (nrow(dt) == 0) return() + + # determine from column types, which ones should be quoted + if (quote) { + column_types <- lapply(dt, class) + quoted_cols <- column_types %in% c('character', 'factor') + } + + # write in blocks of given size to avoid generating full copies + # of columns in memory + block_begin <- 1 + + repeat { + block_end <- min(block_begin+(block.size-1), nrow(dt)) + + dt_block <- dt[c(block_begin:block_end),] + + # convert data.frame row block to a list of columns + col_list <- lapply(dt_block, function(column) { + str_col <- as.character(column) + str_col[is.na(str_col)] <- '' + str_col + }) + + .Call(Cwritefile, col_list, file.path, sep, eol, quoted_cols, qmethod == "escape", append) + + if (block_end == nrow(dt)) break + + append <- TRUE + block_begin <- block_end+1 + } +} \ No newline at end of file diff --git a/README.md b/README.md index 3906eea132..8546c231b0 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,8 @@ 28. Joins (and binary search based subsets) using `on=` argument now reuses existing (secondary) indices, [#1439](https://github.com/Rdatatable/data.table/issues/1439). Thanks @jangorecki. 29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389). + + 30. New function `fwrite`. Fixes [#580](https://github.com/Rdatatable/data.table/issues/580). Thanks @oseiskar. #### BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7ffb66b7b6..c85d8f6083 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8404,6 +8404,118 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p test(1639.138, ans, split(as.data.table(df), by=c("product","year"))) test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE))) +# fwrite tests +local({ + fwrite_test <- function(number, writer, expected_result) { + f <- tempfile() + writer(f) + result <- readChar(f, file.info(f)$size) + unlink(f) + test(number, result, expected_result) + } + + # without quoting + fwrite_test(1640.1, function(f) { + fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), f, quote=F) + }, 'a,b\n,foo\n2,\n3.01,bar\n') + + # with quoting and qmethod="escape" + fwrite_test(1640.2, function(f) { + fwrite(data.table( + a=c(NA, 2, 3.01), + `other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')), + f, quote=T, qmethod="escape") + }, '"a","other column"\n,"foo bar"\n2,""\n3.01,"quote\\" and \\\\ bs \n and newline"\n') + + # with quoting and qmethod="double" (default) + fwrite_test(1640.3, function(f) { + fwrite(data.table( + a=c(NA, 1e-10, 3.01), + `other "column`=c('foo bar', NA, 'quote" and \\ bs')), + f, quote=T, qmethod="double") + }, '"a","other ""column"\n,"foo bar"\n1e-10,""\n3.01,"quote"" and \\ bs"\n') + + # changing sep + fwrite_test(1640.4, function(f) { fwrite(data.table(a="foo", b="ba\"r"), f, sep=";") }, + '"a";"b"\n"foo";"ba""r"\n') + + # changing eol + fwrite_test(1640.5, function(f) { fwrite(data.table(a="foo", b="bar"), f, eol="\r\n") }, + '"a","b"\r\n"foo","bar"\r\n') + + # no col.names + fwrite_test(1640.6, function(f) { fwrite(data.table(a="foo", b="bar"), f, col.names=F) }, + '"foo","bar"\n') + + # small block size to assert that blocking works correctly + fwrite_test(1640.7, function(f) { fwrite(data.table(a=c(1:5), b=c(1:5)), f, block.size=2) }, + '"a","b"\n1,1\n2,2\n3,3\n4,4\n5,5\n') + + # block size equal to number of rows + fwrite_test(1640.8, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=3) }, + '"a","b"\n1,1\n2,2\n3,3\n') + + # block size one bigger than number of rows + fwrite_test(1640.9, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=4) }, + '"a","b"\n1,1\n2,2\n3,3\n') + + # block size one less than number of rows + fwrite_test(1640.10, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=2) }, + '"a","b"\n1,1\n2,2\n3,3\n') + + # writing a data.frame + fwrite_test(1640.11, function(f) { fwrite(data.frame(a="foo", b="bar"), f) }, + '"a","b"\n"foo","bar"\n') + + # single-column data.table + fwrite_test(1640.12, function(f) { fwrite(data.table(a=c(1,2,3)), f) }, + '"a"\n1\n2\n3\n') + + # single-column data.frame + fwrite_test(1640.13, function(f) { fwrite(data.frame(a=c(1,2,3)), f) }, + '"a"\n1\n2\n3\n') + + # factor columns + fwrite_test(1640.14, function(f) { + fwrite(data.table(a=as.factor(c('foo', 'bar')), b=as.factor(c(NA, "baz"))), f) + }, '"a","b"\n"foo",""\n"bar","baz"\n') + + # empty data table (headers but no rows) + empty_dt <- data.table(a=1, b=2)[0,] + fwrite_test(1640.15, function(f) { fwrite(empty_dt, f) }, '"a","b"\n') + + # test append + f <- tempfile() + fwrite(data.table(a=c(1,2), b=c('a', 'b')), f) + fwrite(data.table(a=c(3,4), b=c('c', 'd')), f, append=T) + res <- readChar(f, file.info(f)$size) + unlink(f) + test(1640.16, res, '"a","b"\n1,"a"\n2,"b"\n3,"c"\n4,"d"\n') + + # simple data table (reference for the error cases below) + ok_dt <- data.table(foo="bar") + fwrite_test(1640.17, function(f) { fwrite(ok_dt, f) }, '"foo"\n"bar"\n') + + # error cases + fwrite_expect_error <- function(test_number, writer) { + f <- tempfile() + was_error <- F + tryCatch(writer(f), error=function(e) { was_error <<- T }) + test(test_number, TRUE, was_error) + stopifnot(!file.exists(f)) + } + + # wrong argument types + fwrite_expect_error(1640.18, function(f) {fwrite(ok_dt, 1)}) + fwrite_expect_error(1640.19, function(f) {fwrite(ok_dt, f, quote=123)}) + fwrite_expect_error(1640.20, function(f) {fwrite(ok_dt, f, sep="...")}) + fwrite_expect_error(1640.21, function(f) {fwrite(ok_dt, f, qmethod=c("double", "double"))}) + fwrite_expect_error(1640.22, function(f) {fwrite(ok_dt, f, col.names="foobar")}) + fwrite_expect_error(1640.23, function(f) {fwrite(data.table(a=1, a=2), f)}) + + # null data table (no columns) + fwrite_expect_error(1640.24, function(f) {fwrite(data.table(a=1)[NULL,], f)}) +}) ########################## diff --git a/man/fwrite.Rd b/man/fwrite.Rd new file mode 100644 index 0000000000..ceea135b85 --- /dev/null +++ b/man/fwrite.Rd @@ -0,0 +1,41 @@ +\name{fwrite} +\alias{fwrite} +\title{Fast CSV writer} +\description{ + Similar to \code{write.table} but faster and more limited in features. +} +\usage{ +fwrite(dt, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", + col.names = TRUE, qmethod = "double", block.size = 10000) +} +\arguments{ + \item{dt}{The \code{data.table} or \code{data.frame} to be write} + \item{file.path}{Output file name} + \item{append}{If \code{TRUE}, the file is opened in append mode and column names (header row) are not written.} + \item{quote}{If \code{TRUE}, all columns of character and factor types, as well as all column names, will be surrounded by double quotes. If \code{FALSE}, nothing is quoted, even if this would break the CSV (the column contents are not checked for separator characters).} + \item{sep}{The separator between columns} + \item{eol}{Line separator} + \item{col.names}{A logical value indicating if the column names (header row) should be written} + \item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. Must be one of "escape", in which case the quote character (as well as the backslash character) is escaped in C style by a backslash, or "double" (default), in which case it is doubled.} + \item{block.size}{The output is written in blocks, each of which contains at most this number of rows. This is to avoid making large copies in memory. Can be used to tweak performance and memory usage.} +} +\details{ +\code{NA} values are always represented by empty strings, quoted for character and factor columns if \code{quote=TRUE}. + +The speed-up compared to \code{write.csv} depends on the parameters and column types. This version should be significantly faster if the table consists mainly of character columns. In addition, improved performance can be expected if one can make do with \code{quote=FALSE}. +} +\seealso{ \code{\link[utils]{write.csv}} } +\examples{ +\dontrun{ + +fwrite(data.table(first=c(1,2), second=c(NA, 'foo"bar')), "table.csv") + +# table.csv contains: + +# "first","second" +# "1","" +# "2","foo""bar" +} +} +\keyword{ data } + diff --git a/src/fwrite.c b/src/fwrite.c new file mode 100644 index 0000000000..108810fb14 --- /dev/null +++ b/src/fwrite.c @@ -0,0 +1,59 @@ +#include +#include +#include + +void writefile(SEXP list_of_columns, + SEXP filename, + SEXP col_sep_exp, + SEXP row_sep_exp, + SEXP quote_cols, + SEXP qmethod_escape_exp, + SEXP append) { + + int error_number = 0; + int qmethod_escape = *LOGICAL(qmethod_escape_exp); + + errno = 0; /* clear flag possibly set by previous errors */ + + char col_sep = *CHAR(STRING_ELT(col_sep_exp, 0)); + const char *row_sep = CHAR(STRING_ELT(row_sep_exp, 0)); + const char QUOTE_CHAR = '"'; + const char ESCAPE_CHAR = '\\'; + + /* open input file in correct mode */ + const char *open_mode = "wb"; + if (*LOGICAL(append)) open_mode = "ab"; + FILE *f = fopen(CHAR(STRING_ELT(filename, 0)), open_mode); + if (f == NULL) goto end; + + R_xlen_t ncols = LENGTH(list_of_columns); + R_xlen_t nrows = LENGTH(VECTOR_ELT(list_of_columns, 0)); + + for (R_xlen_t row_i = 0; row_i < nrows; ++row_i) { + for (int col_i = 0; col_i < ncols; ++col_i) { + + if (col_i > 0) fputc(col_sep, f); + + int quote = LOGICAL(quote_cols)[col_i]; + + if (quote) fputc(QUOTE_CHAR, f); + for (const char *ch = CHAR(STRING_ELT(VECTOR_ELT(list_of_columns, col_i), row_i)); *ch != '\0'; ++ch) { + if (quote) { + if (*ch == QUOTE_CHAR) { + if (qmethod_escape) fputc(ESCAPE_CHAR, f); + else fputc(QUOTE_CHAR, f); /* qmethod = "double" */ + } + if (qmethod_escape && *ch == ESCAPE_CHAR) fputc(ESCAPE_CHAR, f); + } + fputc(*ch, f); + } + if (quote) fputc(QUOTE_CHAR, f); + } + if (fputs(row_sep, f) < 0) goto end; + } + + end: + error_number = errno; + if (f != NULL) fclose(f); + if (error_number) error(strerror(errno)); +} \ No newline at end of file diff --git a/src/init.c b/src/init.c index 7d329e681a..0bc007cfec 100644 --- a/src/init.c +++ b/src/init.c @@ -17,6 +17,7 @@ SEXP setcharvec(); SEXP setcolorder(); SEXP chmatchwrapper(); SEXP readfile(); +SEXP writefile(); SEXP reorder(); SEXP rbindlist(); SEXP vecseq(); @@ -87,6 +88,7 @@ R_CallMethodDef callMethods[] = { {"Csetcolorder", (DL_FUNC) &setcolorder, -1}, {"Cchmatchwrapper", (DL_FUNC) &chmatchwrapper, -1}, {"Creadfile", (DL_FUNC) &readfile, -1}, +{"Cwritefile", (DL_FUNC) &writefile, -1}, {"Creorder", (DL_FUNC) &reorder, -1}, {"Crbindlist", (DL_FUNC) &rbindlist, -1}, {"Cvecseq", (DL_FUNC) &vecseq, -1}, From 8696b9fc156ef1f021da35f549d6a71cf56844f9 Mon Sep 17 00:00:00 2001 From: Otto Seiskari Date: Mon, 28 Mar 2016 11:17:23 +0300 Subject: [PATCH 2/6] minor changes in fwrite: renamed param dt to x, removed requirement of unique column names, replaced %in% -> %chin% --- R/fwrite.R | 29 ++++++++++++++--------------- inst/tests/tests.Rraw | 4 +++- man/fwrite.Rd | 4 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 7d5e5074e7..566fd9ba3e 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -1,11 +1,10 @@ -fwrite <- function(dt, file.path, append = FALSE, quote = TRUE, +fwrite <- function(x, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", col.names = TRUE, qmethod = "double", block.size = 10000) { # validate arguments - stopifnot(is.data.frame(dt)) - stopifnot(ncol(dt) > 0) - stopifnot(identical(unique(names(dt)), names(dt))) + stopifnot(is.data.frame(x)) + stopifnot(ncol(x) > 0) stopifnot(length(quote) == 1 && class(quote) == "logical") stopifnot(length(sep) == 1 && class(sep) == "character" && nchar(sep) == 1) @@ -15,25 +14,25 @@ fwrite <- function(dt, file.path, append = FALSE, quote = TRUE, stopifnot(length(append) == 1 && class(append) == "logical") stopifnot(length(block.size) == 1 && block.size > 0) - quoted_cols <- rep(quote, ncol(dt)) + quoted_cols <- rep(quote, ncol(x)) - # special case: single-column data.frame, doing dt[block_begin:block_end,] + # special case: single-column data.frame, doing x[block_begin:block_end,] # for such data frame gives a vector - if (!is.data.table(dt) && ncol(dt) == 1) dt <- as.data.table(dt) + if (!is.data.table(x) && ncol(x) == 1) x <- as.data.table(x) # write header row separately for correct quoting of row names if (col.names && !append) { - .Call(Cwritefile, as.list(names(dt)), file.path, sep, eol, quoted_cols, qmethod == "escape", append) + .Call(Cwritefile, as.list(names(x)), file.path, sep, eol, quoted_cols, qmethod == "escape", append) append <- TRUE } - # handle empty dt - if (nrow(dt) == 0) return() + # handle empty x + if (nrow(x) == 0) return() # determine from column types, which ones should be quoted if (quote) { - column_types <- lapply(dt, class) - quoted_cols <- column_types %in% c('character', 'factor') + column_types <- sapply(x, class) + quoted_cols <- column_types %chin% c('character', 'factor') } # write in blocks of given size to avoid generating full copies @@ -41,9 +40,9 @@ fwrite <- function(dt, file.path, append = FALSE, quote = TRUE, block_begin <- 1 repeat { - block_end <- min(block_begin+(block.size-1), nrow(dt)) + block_end <- min(block_begin+(block.size-1), nrow(x)) - dt_block <- dt[c(block_begin:block_end),] + dt_block <- x[c(block_begin:block_end),] # convert data.frame row block to a list of columns col_list <- lapply(dt_block, function(column) { @@ -54,7 +53,7 @@ fwrite <- function(dt, file.path, append = FALSE, quote = TRUE, .Call(Cwritefile, col_list, file.path, sep, eol, quoted_cols, qmethod == "escape", append) - if (block_end == nrow(dt)) break + if (block_end == nrow(x)) break append <- TRUE block_begin <- block_end+1 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c85d8f6083..3e9f4811b8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8484,6 +8484,9 @@ local({ empty_dt <- data.table(a=1, b=2)[0,] fwrite_test(1640.15, function(f) { fwrite(empty_dt, f) }, '"a","b"\n') + # data.table with duplicate column names + fwrite_test(1640.151, function(f) {fwrite(data.table(a=1, a=2), f)}, '"a","a"\n1,2\n') + # test append f <- tempfile() fwrite(data.table(a=c(1,2), b=c('a', 'b')), f) @@ -8511,7 +8514,6 @@ local({ fwrite_expect_error(1640.20, function(f) {fwrite(ok_dt, f, sep="...")}) fwrite_expect_error(1640.21, function(f) {fwrite(ok_dt, f, qmethod=c("double", "double"))}) fwrite_expect_error(1640.22, function(f) {fwrite(ok_dt, f, col.names="foobar")}) - fwrite_expect_error(1640.23, function(f) {fwrite(data.table(a=1, a=2), f)}) # null data table (no columns) fwrite_expect_error(1640.24, function(f) {fwrite(data.table(a=1)[NULL,], f)}) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index ceea135b85..254a8a18ec 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -5,11 +5,11 @@ Similar to \code{write.table} but faster and more limited in features. } \usage{ -fwrite(dt, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", +fwrite(x, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", col.names = TRUE, qmethod = "double", block.size = 10000) } \arguments{ - \item{dt}{The \code{data.table} or \code{data.frame} to be write} + \item{x}{The \code{data.table} or \code{data.frame} to write} \item{file.path}{Output file name} \item{append}{If \code{TRUE}, the file is opened in append mode and column names (header row) are not written.} \item{quote}{If \code{TRUE}, all columns of character and factor types, as well as all column names, will be surrounded by double quotes. If \code{FALSE}, nothing is quoted, even if this would break the CSV (the column contents are not checked for separator characters).} From 73aeff8c7fc123f0bbe0bf370395c162ac883420 Mon Sep 17 00:00:00 2001 From: Otto Seiskari Date: Mon, 28 Mar 2016 18:12:10 +0300 Subject: [PATCH 3/6] Improved fwrite performance by handling number formatting and NAs in C. Also added na option to fwrite. --- R/fwrite.R | 13 +++++++------ inst/tests/tests.Rraw | 18 +++++++++++++----- man/fwrite.Rd | 7 +++---- src/fwrite.c | 40 ++++++++++++++++++++++++++++++---------- 4 files changed, 53 insertions(+), 25 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 566fd9ba3e..0571737b57 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -1,5 +1,5 @@ fwrite <- function(x, file.path, append = FALSE, quote = TRUE, - sep = ",", eol = "\n", col.names = TRUE, qmethod = "double", + sep = ",", eol = "\n", na = "", col.names = TRUE, qmethod = "double", block.size = 10000) { # validate arguments @@ -22,7 +22,7 @@ fwrite <- function(x, file.path, append = FALSE, quote = TRUE, # write header row separately for correct quoting of row names if (col.names && !append) { - .Call(Cwritefile, as.list(names(x)), file.path, sep, eol, quoted_cols, qmethod == "escape", append) + .Call(Cwritefile, as.list(names(x)), file.path, sep, eol, na, quoted_cols, qmethod == "escape", append) append <- TRUE } @@ -46,12 +46,13 @@ fwrite <- function(x, file.path, append = FALSE, quote = TRUE, # convert data.frame row block to a list of columns col_list <- lapply(dt_block, function(column) { - str_col <- as.character(column) - str_col[is.na(str_col)] <- '' - str_col + if (!(class(column) %chin% c('integer', 'numeric', 'character'))) { + column <- as.character(column) + } + column }) - .Call(Cwritefile, col_list, file.path, sep, eol, quoted_cols, qmethod == "escape", append) + .Call(Cwritefile, col_list, file.path, sep, eol, na, quoted_cols, qmethod == "escape", append) if (block_end == nrow(x)) break diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3e9f4811b8..45cddc4515 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8425,7 +8425,7 @@ local({ a=c(NA, 2, 3.01), `other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')), f, quote=T, qmethod="escape") - }, '"a","other column"\n,"foo bar"\n2,""\n3.01,"quote\\" and \\\\ bs \n and newline"\n') + }, '"a","other column"\n,"foo bar"\n2,\n3.01,"quote\\" and \\\\ bs \n and newline"\n') # with quoting and qmethod="double" (default) fwrite_test(1640.3, function(f) { @@ -8433,7 +8433,7 @@ local({ a=c(NA, 1e-10, 3.01), `other "column`=c('foo bar', NA, 'quote" and \\ bs')), f, quote=T, qmethod="double") - }, '"a","other ""column"\n,"foo bar"\n1e-10,""\n3.01,"quote"" and \\ bs"\n') + }, '"a","other ""column"\n,"foo bar"\n1e-10,\n3.01,"quote"" and \\ bs"\n') # changing sep fwrite_test(1640.4, function(f) { fwrite(data.table(a="foo", b="ba\"r"), f, sep=";") }, @@ -8443,6 +8443,10 @@ local({ fwrite_test(1640.5, function(f) { fwrite(data.table(a="foo", b="bar"), f, eol="\r\n") }, '"a","b"\r\n"foo","bar"\r\n') + # changing NA + fwrite_test(1640.51, function(f) { fwrite(data.table(a=c("foo", NA), b=c(1, NA)), f, na="NA") }, + '"a","b"\n"foo",1\nNA,NA\n') + # no col.names fwrite_test(1640.6, function(f) { fwrite(data.table(a="foo", b="bar"), f, col.names=F) }, '"foo","bar"\n') @@ -8475,10 +8479,14 @@ local({ fwrite_test(1640.13, function(f) { fwrite(data.frame(a=c(1,2,3)), f) }, '"a"\n1\n2\n3\n') - # factor columns + # different column types fwrite_test(1640.14, function(f) { - fwrite(data.table(a=as.factor(c('foo', 'bar')), b=as.factor(c(NA, "baz"))), f) - }, '"a","b"\n"foo",""\n"bar","baz"\n') + fwrite(data.table( + factor1=as.factor(c('foo', 'bar')), + factor2=as.factor(c(NA, "baz")), + bool=c(TRUE,NA), + ints=as.integer(c(NA, 5))), f, na='na')}, + '"factor1","factor2","bool","ints"\n"foo",na,TRUE,na\n"bar","baz",na,5\n') # empty data table (headers but no rows) empty_dt <- data.table(a=1, b=2)[0,] diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 254a8a18ec..ba640790fd 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -5,7 +5,7 @@ Similar to \code{write.table} but faster and more limited in features. } \usage{ -fwrite(x, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", +fwrite(x, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", na = "", col.names = TRUE, qmethod = "double", block.size = 10000) } \arguments{ @@ -15,14 +15,13 @@ fwrite(x, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", \item{quote}{If \code{TRUE}, all columns of character and factor types, as well as all column names, will be surrounded by double quotes. If \code{FALSE}, nothing is quoted, even if this would break the CSV (the column contents are not checked for separator characters).} \item{sep}{The separator between columns} \item{eol}{Line separator} + \item{na}{The string to use for missing values in the data} \item{col.names}{A logical value indicating if the column names (header row) should be written} \item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. Must be one of "escape", in which case the quote character (as well as the backslash character) is escaped in C style by a backslash, or "double" (default), in which case it is doubled.} \item{block.size}{The output is written in blocks, each of which contains at most this number of rows. This is to avoid making large copies in memory. Can be used to tweak performance and memory usage.} } \details{ -\code{NA} values are always represented by empty strings, quoted for character and factor columns if \code{quote=TRUE}. - -The speed-up compared to \code{write.csv} depends on the parameters and column types. This version should be significantly faster if the table consists mainly of character columns. In addition, improved performance can be expected if one can make do with \code{quote=FALSE}. +The speed-up compared to \code{write.csv} depends on the parameters and column types. } \seealso{ \code{\link[utils]{write.csv}} } \examples{ diff --git a/src/fwrite.c b/src/fwrite.c index 108810fb14..8e63df8c12 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -6,6 +6,7 @@ void writefile(SEXP list_of_columns, SEXP filename, SEXP col_sep_exp, SEXP row_sep_exp, + SEXP na_exp, SEXP quote_cols, SEXP qmethod_escape_exp, SEXP append) { @@ -17,6 +18,7 @@ void writefile(SEXP list_of_columns, char col_sep = *CHAR(STRING_ELT(col_sep_exp, 0)); const char *row_sep = CHAR(STRING_ELT(row_sep_exp, 0)); + const char *na_str = CHAR(STRING_ELT(na_exp, 0)); const char QUOTE_CHAR = '"'; const char ESCAPE_CHAR = '\\'; @@ -34,20 +36,38 @@ void writefile(SEXP list_of_columns, if (col_i > 0) fputc(col_sep, f); - int quote = LOGICAL(quote_cols)[col_i]; + SEXP column = VECTOR_ELT(list_of_columns, col_i); - if (quote) fputc(QUOTE_CHAR, f); - for (const char *ch = CHAR(STRING_ELT(VECTOR_ELT(list_of_columns, col_i), row_i)); *ch != '\0'; ++ch) { - if (quote) { - if (*ch == QUOTE_CHAR) { - if (qmethod_escape) fputc(ESCAPE_CHAR, f); - else fputc(QUOTE_CHAR, f); /* qmethod = "double" */ + switch(TYPEOF(column)) { + case INTSXP: + if (INTEGER(column)[row_i] == NA_INTEGER) fputs(na_str, f); + else fprintf(f, "%d", INTEGER(column)[row_i]); + break; + + case REALSXP: + if (ISNA(REAL(column)[row_i])) fputs(na_str, f); + else fprintf(f, "%g", REAL(column)[row_i]); + break; + + default: /* assuming STRSXP */ + if (STRING_ELT(column, row_i) == NA_STRING) fputs(na_str, f); + else { + int quote = LOGICAL(quote_cols)[col_i]; + if (quote) fputc(QUOTE_CHAR, f); + for (const char *ch = CHAR(STRING_ELT(column, row_i)); *ch != '\0'; ++ch) { + if (quote) { + if (*ch == QUOTE_CHAR) { + if (qmethod_escape) fputc(ESCAPE_CHAR, f); + else fputc(QUOTE_CHAR, f); /* qmethod = "double" */ + } + if (qmethod_escape && *ch == ESCAPE_CHAR) fputc(ESCAPE_CHAR, f); + } + fputc(*ch, f); } - if (qmethod_escape && *ch == ESCAPE_CHAR) fputc(ESCAPE_CHAR, f); + if (quote) fputc(QUOTE_CHAR, f); } - fputc(*ch, f); + break; } - if (quote) fputc(QUOTE_CHAR, f); } if (fputs(row_sep, f) < 0) goto end; } From ff5ec810e74834f67c71d4fdf5e739404a51dac0 Mon Sep 17 00:00:00 2001 From: Otto Seiskari Date: Mon, 28 Mar 2016 18:40:18 +0300 Subject: [PATCH 4/6] changed fwrite test to allow different floating point notations from fprintf, e.g., 1e-10 (Linux) = 1e-010 (Windows) --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 45cddc4515..e8448f6ef1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8430,10 +8430,10 @@ local({ # with quoting and qmethod="double" (default) fwrite_test(1640.3, function(f) { fwrite(data.table( - a=c(NA, 1e-10, 3.01), + a=c(NA, 1.2e-100, 3.01), `other "column`=c('foo bar', NA, 'quote" and \\ bs')), f, quote=T, qmethod="double") - }, '"a","other ""column"\n,"foo bar"\n1e-10,\n3.01,"quote"" and \\ bs"\n') + }, '"a","other ""column"\n,"foo bar"\n1.2e-100,\n3.01,"quote"" and \\ bs"\n') # changing sep fwrite_test(1640.4, function(f) { fwrite(data.table(a="foo", b="ba\"r"), f, sep=";") }, From 1b79ec4d228284c04140bb9eb0e7e6169a7afbab Mon Sep 17 00:00:00 2001 From: Otto Seiskari Date: Mon, 28 Mar 2016 21:10:55 +0300 Subject: [PATCH 5/6] using path.expand on the file.path argument of fwrite --- R/fwrite.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/fwrite.R b/R/fwrite.R index 0571737b57..ecf90364cc 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -14,6 +14,9 @@ fwrite <- function(x, file.path, append = FALSE, quote = TRUE, stopifnot(length(append) == 1 && class(append) == "logical") stopifnot(length(block.size) == 1 && block.size > 0) + # handle paths like "~/foo/bar" + file.path <- path.expand(file.path) + quoted_cols <- rep(quote, ncol(x)) # special case: single-column data.frame, doing x[block_begin:block_end,] From e8c50d1e7a37d65b1df6d7957d8bd85ac052d85d Mon Sep 17 00:00:00 2001 From: Otto Seiskari Date: Tue, 29 Mar 2016 19:28:37 +0300 Subject: [PATCH 6/6] increased number of significant digits to 15 in fwrite --- inst/tests/tests.Rraw | 4 ++++ src/fwrite.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e8448f6ef1..5556950170 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8494,6 +8494,10 @@ local({ # data.table with duplicate column names fwrite_test(1640.151, function(f) {fwrite(data.table(a=1, a=2), f)}, '"a","a"\n1,2\n') + + # number of significant digits = 15 + fwrite_test(1640.152, function(f) {fwrite(data.table(a=1/0.9), f)}, + '"a"\n1.11111111111111\n') # test append f <- tempfile() diff --git a/src/fwrite.c b/src/fwrite.c index 8e63df8c12..5a50d8d343 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -46,7 +46,7 @@ void writefile(SEXP list_of_columns, case REALSXP: if (ISNA(REAL(column)[row_i])) fputs(na_str, f); - else fprintf(f, "%g", REAL(column)[row_i]); + else fprintf(f, "%.15g", REAL(column)[row_i]); break; default: /* assuming STRSXP */