diff --git a/R/fread.R b/R/fread.R
index 876be4f966..92c191231b 100644
--- a/R/fread.R
+++ b/R/fread.R
@@ -74,6 +74,33 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
     if ( integer64=="integer64" && !exists("print.integer64") && any(sapply(ans,inherits,"integer64")) )
         warning("Some columns have been read as type 'integer64' but package bit64 isn't loaded. Those columns will display as strange looking floating point data. There is no need to reload the data. Just require(bit64) to obtain the integer64 print method and print the data again.")
     setattr(ans,"row.names",.set_row_names(nr))
+    # Convert a character vector to a factor without calling base::factor().
+    # Levels are the distinct non-NA values in the order forderv() sorts them; NA
+    # entries keep an NA code rather than becoming a level, as with factor(x).
+    as_factor <- function(x) {
+        ord = forderv(x, retGrp = TRUE)
+        starts = attr(ord, "starts", exact = TRUE)
+        # a zero-length order means x is already sorted, so the group starts
+        # index x directly
+        lev = if (length(ord)) x[ord[starts]] else x[starts]
+        # drop NA from the levels so chmatch() leaves missing strings as NA
+        lev = lev[!is.na(lev)]
+        ans = chmatch(x, lev)
+        setattr(ans, 'levels', lev)
+        setattr(ans, 'class', 'factor')
+    }
+    if (isTRUE(as.logical(stringsAsFactors))) {
+        # NOTE(review): class is set here presumably so set() below accepts 'ans';
+        # the final class is expected to be (re)assigned by the code that follows - confirm
+        setattr(ans,"class",c("data.table","data.frame"))
+        idx = which(vapply(ans, is.character, TRUE))
+        if (length(idx)) {
+            if (verbose)
+                cat("Converting char column(s) [", paste(names(ans)[idx], collapse=", "), "] to factors\n", sep="")
+            for (j in idx)
+                set(ans, i = NULL, j = j, value = as_factor(ans[[j]]))
+        }
+    }
     if (isTRUE(data.table)) {
         setattr(ans,"class",c("data.table","data.frame"))
         return(alloc.col(ans))
@@ -82,5 +109,3 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
         return(ans)
     }
 }
-
-
diff --git a/README.md b/README.md
index a673eb0f1c..1982b2ac2e 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,8 @@
   23. `CJ` gains logical `unique` argument with default `FALSE`. If `TRUE`, unique values of vectors are automatically computed and used. This is convenient, for example, `DT[CJ(a, b, c, unique=TRUE)]` instead of doing `DT[CJ(unique(a), unique(b), unique(c))]`. Ultimately, `unique = TRUE` will be default.
Closes [#1148](https://github.com/Rdatatable/data.table/issues/1148). + 24. Implemented `stringsAsFactors` argument for `fread()`. When `TRUE`, character columns are converted to factors. Default is `FALSE`. Thanks to Artem Klevtsov for filing [#501](https://github.com/Rdatatable/data.table/issues/501), and to @hmi2015 for [this SO post](http://stackoverflow.com/q/31350209/559784). + #### BUG FIXES 1. `if (TRUE) DT[,LHS:=RHS]` no longer prints, [#869](https://github.com/Rdatatable/data.table/issues/869) and [#1122](https://github.com/Rdatatable/data.table/issues/1122). Tests added. To get this to work we've had to live with one downside: if a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` or `print(DT)` is typed at the prompt, nothing will be printed. A repeated `DT` or `print(DT)` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `DT[]` at the prompt is guaranteed to print. As before, adding an extra `[]` on the end of a `:=` query is a recommended idiom to update and then print; e.g. `> DT[,foo:=3L][]`. Thanks to Jureiss and Jan Gorecki for reporting. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index dbe39e61e0..95cacdd7ff 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -6430,6 +6430,19 @@ setDT(DT, key = "A")
 test(1526.1, key(DT), "A")
 test(1526.2, key(setDT(DT, key = NULL)), NULL)
 
+# #501, fread stringsAsFactors=TRUE
+dt = data.table(x=1:5, y = letters[1:5])
+text = "x,y\n1,a\n2,b\n3,c\n4,d\n5,e\n"
+test(1527.1, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE))
+# unsorted unique values, spelled out (not sample()d) so the fixture cannot drift from 'text' across R versions
+dt = data.table(x=1:5, y = c("b","e","d","c","a"))
+text = "x,y\n1,b\n2,e\n3,d\n4,c\n5,a\n"
+test(1527.2, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE))
+# repeated values, again spelled out rather than drawn from the RNG
+dt = data.table(x=1:5, y = c("a","a","b","b","a"))
+text = "x,y\n1,a\n2,a\n3,b\n4,b\n5,a\n"
+test(1527.3, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE))
+
 
 
 ##########################