diff --git a/NAMESPACE b/NAMESPACE index 45dd8f8269..3e6b1bd85d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(setNumericRounding, getNumericRounding) export(chmatch, "%chin%", chorder, chgroup) export(rbindlist) export(fread) +export(fwrite) export(foverlaps) export(shift) export(transpose) diff --git a/R/fwrite.R b/R/fwrite.R new file mode 100644 index 0000000000..ecf90364cc --- /dev/null +++ b/R/fwrite.R @@ -0,0 +1,65 @@ +fwrite <- function(x, file.path, append = FALSE, quote = TRUE, + sep = ",", eol = "\n", na = "", col.names = TRUE, qmethod = "double", + block.size = 10000) { + + # validate arguments + stopifnot(is.data.frame(x)) + stopifnot(ncol(x) > 0) + + stopifnot(length(quote) == 1 && class(quote) == "logical") + stopifnot(length(sep) == 1 && class(sep) == "character" && nchar(sep) == 1) + stopifnot(length(eol) == 1 && class(eol) == "character") + stopifnot(length(qmethod) == 1 && qmethod %in% c("double", "escape")) + stopifnot(length(col.names) == 1 && class(col.names) == "logical") + stopifnot(length(append) == 1 && class(append) == "logical") + stopifnot(length(block.size) == 1 && block.size > 0) + + # handle paths like "~/foo/bar" + file.path <- path.expand(file.path) + + quoted_cols <- rep(quote, ncol(x)) + + # special case: single-column data.frame, doing x[block_begin:block_end,] + # for such data frame gives a vector + if (!is.data.table(x) && ncol(x) == 1) x <- as.data.table(x) + + # write header row separately for correct quoting of row names + if (col.names && !append) { + .Call(Cwritefile, as.list(names(x)), file.path, sep, eol, na, quoted_cols, qmethod == "escape", append) + append <- TRUE + } + + # handle empty x + if (nrow(x) == 0) return() + + # determine from column types, which ones should be quoted + if (quote) { + column_types <- sapply(x, class) + quoted_cols <- column_types %chin% c('character', 'factor') + } + + # write in blocks of given size to avoid generating full copies + # of columns in memory + block_begin <- 1 + + repeat { + block_end <- min(block_begin+(block.size-1), nrow(x)) + + dt_block <- x[c(block_begin:block_end),] + + # convert data.frame row block to a list of columns + col_list <- lapply(dt_block, function(column) { + if (!(class(column) %chin% c('integer', 'numeric', 'character'))) { + column <- as.character(column) + } + column + }) + + .Call(Cwritefile, col_list, file.path, sep, eol, na, quoted_cols, qmethod == "escape", append) + + if (block_end == nrow(x)) break + + append <- TRUE + block_begin <- block_end+1 + } +} \ No newline at end of file diff --git a/README.md b/README.md index c0d6d65e3f..0d2da99a99 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,8 @@ 29. New `split` method for data.table. Faster, more flexible and consistent with data.frame method. Closes [#1389](https://github.com/Rdatatable/data.table/issues/1389). 30. x's columns can be referred to in `j` using the prefix `x.` at all times. This is particularly useful when it is necessary to x's column that is *also a join column*. This is a patch addressing [#1615](https://github.com/Rdatatable/data.table/issues/1615). + + 31. New function `fwrite`. Fixes [#580](https://github.com/Rdatatable/data.table/issues/580). Thanks @oseiskar. #### BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b44bd03bef..ea148fabd8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8412,6 +8412,133 @@ y <- data.table(bb = 3:5, dd = 3:1) test(1640.1, x[y, x.aa, on=c(aa="bb")], INT(3,NA,NA)) test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c("c", NA,NA), x.aa=INT(3,NA,NA))) +# fwrite tests +local({ + fwrite_test <- function(number, writer, expected_result) { + f <- tempfile() + writer(f) + result <- readChar(f, file.info(f)$size) + unlink(f) + test(number, result, expected_result) + } + + # without quoting + fwrite_test(1641.1, function(f) { + fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), f, quote=F) + }, 'a,b\n,foo\n2,\n3.01,bar\n') + + # with quoting and qmethod="escape" + fwrite_test(1641.2, function(f) { + fwrite(data.table( + a=c(NA, 2, 3.01), + `other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')), + f, quote=T, qmethod="escape") + }, '"a","other column"\n,"foo bar"\n2,\n3.01,"quote\\" and \\\\ bs \n and newline"\n') + + # with quoting and qmethod="double" (default) + fwrite_test(1641.3, function(f) { + fwrite(data.table( + a=c(NA, 1.2e-100, 3.01), + `other "column`=c('foo bar', NA, 'quote" and \\ bs')), + f, quote=T, qmethod="double") + }, '"a","other ""column"\n,"foo bar"\n1.2e-100,\n3.01,"quote"" and \\ bs"\n') + + # changing sep + fwrite_test(1641.4, function(f) { fwrite(data.table(a="foo", b="ba\"r"), f, sep=";") }, + '"a";"b"\n"foo";"ba""r"\n') + + # changing eol + fwrite_test(1641.5, function(f) { fwrite(data.table(a="foo", b="bar"), f, eol="\r\n") }, + '"a","b"\r\n"foo","bar"\r\n') + + # changing NA + fwrite_test(1641.51, function(f) { fwrite(data.table(a=c("foo", NA), b=c(1, NA)), f, na="NA") }, + '"a","b"\n"foo",1\nNA,NA\n') + + # no col.names + fwrite_test(1641.6, function(f) { fwrite(data.table(a="foo", b="bar"), f, col.names=F) }, + '"foo","bar"\n') + + # small block size to assert that blocking works correctly + fwrite_test(1641.7, function(f) { fwrite(data.table(a=c(1:5), b=c(1:5)), f, block.size=2) }, + '"a","b"\n1,1\n2,2\n3,3\n4,4\n5,5\n') + + # block size equal to number of rows + fwrite_test(1641.8, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=3) }, + '"a","b"\n1,1\n2,2\n3,3\n') + + # block size one bigger than number of rows + fwrite_test(1641.9, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=4) }, + '"a","b"\n1,1\n2,2\n3,3\n') + + # block size one less than number of rows + fwrite_test(1641.10, function(f) { fwrite(data.table(a=c(1:3), b=c(1:3)), f, block.size=2) }, + '"a","b"\n1,1\n2,2\n3,3\n') + + # writing a data.frame + fwrite_test(1641.11, function(f) { fwrite(data.frame(a="foo", b="bar"), f) }, + '"a","b"\n"foo","bar"\n') + + # single-column data.table + fwrite_test(1641.12, function(f) { fwrite(data.table(a=c(1,2,3)), f) }, + '"a"\n1\n2\n3\n') + + # single-column data.frame + fwrite_test(1641.13, function(f) { fwrite(data.frame(a=c(1,2,3)), f) }, + '"a"\n1\n2\n3\n') + + # different column types + fwrite_test(1641.14, function(f) { + fwrite(data.table( + factor1=as.factor(c('foo', 'bar')), + factor2=as.factor(c(NA, "baz")), + bool=c(TRUE,NA), + ints=as.integer(c(NA, 5))), f, na='na')}, + '"factor1","factor2","bool","ints"\n"foo",na,TRUE,na\n"bar","baz",na,5\n') + + # empty data table (headers but no rows) + empty_dt <- data.table(a=1, b=2)[0,] + fwrite_test(1641.15, function(f) { fwrite(empty_dt, f) }, '"a","b"\n') + + # data.table with duplicate column names + fwrite_test(1641.151, function(f) {fwrite(data.table(a=1, a=2), f)}, '"a","a"\n1,2\n') + + # number of significant digits = 15 + fwrite_test(1641.152, function(f) {fwrite(data.table(a=1/0.9), f)}, + '"a"\n1.11111111111111\n') + + # test append + f <- tempfile() + fwrite(data.table(a=c(1,2), b=c('a', 'b')), f) + fwrite(data.table(a=c(3,4), b=c('c', 'd')), f, append=T) + res <- readChar(f, file.info(f)$size) + unlink(f) + test(1641.16, res, '"a","b"\n1,"a"\n2,"b"\n3,"c"\n4,"d"\n') + + # simple data table (reference for the error cases below) + ok_dt <- data.table(foo="bar") + fwrite_test(1641.17, function(f) { fwrite(ok_dt, f) }, '"foo"\n"bar"\n') + + # error cases + fwrite_expect_error <- function(test_number, writer) { + f <- tempfile() + was_error <- F + tryCatch(writer(f), error=function(e) { was_error <<- T }) + test(test_number, TRUE, was_error) + stopifnot(!file.exists(f)) + } + + # wrong argument types + fwrite_expect_error(1641.18, function(f) {fwrite(ok_dt, 1)}) + fwrite_expect_error(1641.19, function(f) {fwrite(ok_dt, f, quote=123)}) + fwrite_expect_error(1641.20, function(f) {fwrite(ok_dt, f, sep="...")}) + fwrite_expect_error(1641.21, function(f) {fwrite(ok_dt, f, qmethod=c("double", "double"))}) + fwrite_expect_error(1641.22, function(f) {fwrite(ok_dt, f, col.names="foobar")}) + + # null data table (no columns) + fwrite_expect_error(1641.24, function(f) {fwrite(data.table(a=1)[NULL,], f)}) +}) + ########################## # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time. diff --git a/man/fwrite.Rd b/man/fwrite.Rd new file mode 100644 index 0000000000..ba640790fd --- /dev/null +++ b/man/fwrite.Rd @@ -0,0 +1,40 @@ +\name{fwrite} +\alias{fwrite} +\title{Fast CSV writer} +\description{ + Similar to \code{write.table} but faster and more limited in features. +} +\usage{ +fwrite(x, file.path, append = FALSE, quote = TRUE, sep = ",", eol = "\n", na = "", + col.names = TRUE, qmethod = "double", block.size = 10000) +} +\arguments{ + \item{x}{The \code{data.table} or \code{data.frame} to write} + \item{file.path}{Output file name} + \item{append}{If \code{TRUE}, the file is opened in append mode and column names (header row) are not written.} + \item{quote}{If \code{TRUE}, all columns of character and factor types, as well as all column names, will be surrounded by double quotes. If \code{FALSE}, nothing is quoted, even if this would break the CSV (the column contents are not checked for separator characters).} + \item{sep}{The separator between columns} + \item{eol}{Line separator} + \item{na}{The string to use for missing values in the data} + \item{col.names}{A logical value indicating if the column names (header row) should be written} + \item{qmethod}{A character string specifying how to deal with embedded double quote characters when quoting strings. Must be one of "escape", in which case the quote character (as well as the backslash character) is escaped in C style by a backslash, or "double" (default), in which case it is doubled.} + \item{block.size}{The output is written in blocks, each of which contains at most this number of rows. This is to avoid making large copies in memory. Can be used to tweak performance and memory usage.} +} +\details{ +The speed-up compared to \code{write.csv} depends on the parameters and column types. +} +\seealso{ \code{\link[utils]{write.csv}} } +\examples{ +\dontrun{ + +fwrite(data.table(first=c(1,2), second=c(NA, 'foo"bar')), "table.csv") + +# table.csv contains: + +# "first","second" +# "1","" +# "2","foo""bar" +} +} +\keyword{ data } + diff --git a/src/fwrite.c b/src/fwrite.c new file mode 100644 index 0000000000..5a50d8d343 --- /dev/null +++ b/src/fwrite.c @@ -0,0 +1,79 @@ +#include +#include +#include + +void writefile(SEXP list_of_columns, + SEXP filename, + SEXP col_sep_exp, + SEXP row_sep_exp, + SEXP na_exp, + SEXP quote_cols, + SEXP qmethod_escape_exp, + SEXP append) { + + int error_number = 0; + int qmethod_escape = *LOGICAL(qmethod_escape_exp); + + errno = 0; /* clear flag possibly set by previous errors */ + + char col_sep = *CHAR(STRING_ELT(col_sep_exp, 0)); + const char *row_sep = CHAR(STRING_ELT(row_sep_exp, 0)); + const char *na_str = CHAR(STRING_ELT(na_exp, 0)); + const char QUOTE_CHAR = '"'; + const char ESCAPE_CHAR = '\\'; + + /* open input file in correct mode */ + const char *open_mode = "wb"; + if (*LOGICAL(append)) open_mode = "ab"; + FILE *f = fopen(CHAR(STRING_ELT(filename, 0)), open_mode); + if (f == NULL) goto end; + + R_xlen_t ncols = LENGTH(list_of_columns); + R_xlen_t nrows = LENGTH(VECTOR_ELT(list_of_columns, 0)); + + for (R_xlen_t row_i = 0; row_i < nrows; ++row_i) { + for (int col_i = 0; col_i < ncols; ++col_i) { + + if (col_i > 0) fputc(col_sep, f); + + SEXP column = VECTOR_ELT(list_of_columns, col_i); + + switch(TYPEOF(column)) { + case INTSXP: + if (INTEGER(column)[row_i] == NA_INTEGER) fputs(na_str, f); + else fprintf(f, "%d", INTEGER(column)[row_i]); + break; + + case REALSXP: + if (ISNA(REAL(column)[row_i])) fputs(na_str, f); + else fprintf(f, "%.15g", REAL(column)[row_i]); + break; + + default: /* assuming STRSXP */ + if (STRING_ELT(column, row_i) == NA_STRING) fputs(na_str, f); + else { + int quote = LOGICAL(quote_cols)[col_i]; + if (quote) fputc(QUOTE_CHAR, f); + for (const char *ch = CHAR(STRING_ELT(column, row_i)); *ch != '\0'; ++ch) { + if (quote) { + if (*ch == QUOTE_CHAR) { + if (qmethod_escape) fputc(ESCAPE_CHAR, f); + else fputc(QUOTE_CHAR, f); /* qmethod = "double" */ + } + if (qmethod_escape && *ch == ESCAPE_CHAR) fputc(ESCAPE_CHAR, f); + } + fputc(*ch, f); + } + if (quote) fputc(QUOTE_CHAR, f); + } + break; + } + } + if (fputs(row_sep, f) < 0) goto end; + } + + end: + error_number = errno; + if (f != NULL) fclose(f); + if (error_number) error(strerror(errno)); +} \ No newline at end of file diff --git a/src/init.c b/src/init.c index 7d329e681a..0bc007cfec 100644 --- a/src/init.c +++ b/src/init.c @@ -17,6 +17,7 @@ SEXP setcharvec(); SEXP setcolorder(); SEXP chmatchwrapper(); SEXP readfile(); +SEXP writefile(); SEXP reorder(); SEXP rbindlist(); SEXP vecseq(); @@ -87,6 +88,7 @@ R_CallMethodDef callMethods[] = { {"Csetcolorder", (DL_FUNC) &setcolorder, -1}, {"Cchmatchwrapper", (DL_FUNC) &chmatchwrapper, -1}, {"Creadfile", (DL_FUNC) &readfile, -1}, +{"Cwritefile", (DL_FUNC) &writefile, -1}, {"Creorder", (DL_FUNC) &reorder, -1}, {"Crbindlist", (DL_FUNC) &rbindlist, -1}, {"Cvecseq", (DL_FUNC) &vecseq, -1},