Skip to content

Commit

Permalink
fread quote="" when last line too short and fill=TRUE (#3524)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle authored Apr 25, 2019
1 parent 83628ca commit b600b75
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 13 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@

4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288).

5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
Binary file added inst/tests/noquote.csv.gz
Binary file not shown.
10 changes: 10 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -14072,6 +14072,16 @@ test(2027.1, DT[, list(1, ), by=a], error = 'Item 2 of the .() or list() passed
test(2027.2, DT[, list(1,2,)], error = 'Item 3 of the .() or list() passed to j is missing')
test(2027.3, DT[, .(1,,3,), by=a], error = 'Item 2 of the .() or list() passed to j is missing')

# fread quote="" when last line too short and filled with fill=TRUE (provided via email), crash in v1.12.2 and before
test(2028.1, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="")[c(1,.N), c(1,2,9,10)],
data.table(H=c("D","T"), "Locate Reply"=c("BCS","Locate Reply"), V9=c("A",""), V10=c("4/23/2019 7:11:11 AM","")))
test(2028.2, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="", header=FALSE)[c(1,.N), c(1,2,3,8,9,10)],
data.table(V1=c("H","T"), V2="Locate Reply", V3=c("GORLTR","2093"), V8=INT(NA,NA), V9="", V10=""))
txt = "A,B,C\n1,4,7\n2,5,8\n3,6\n"
test(2029.1, fread(txt), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>")
test(2029.2, fread(txt, quote=""), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>")
test(2029.3, fread(txt, quote="", fill=TRUE), data.table(A=1:3, B=4:6, C=c(7:8,NA)))


###################################
# Add new tests above this line #
Expand Down
29 changes: 16 additions & 13 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,11 @@ static char *typesAsString(int ncol) {
static char str[101];
int i=0;
if (ncol<=100) {
for (; i<ncol; i++) str[i] = typeLetter[type[i]];
for (; i<ncol; i++) str[i] = typeLetter[abs(type[i])]; // abs for out-of-sample type bumps (negative)
} else {
for (; i<80; i++) str[i] = typeLetter[type[i]];
for (; i<80; i++) str[i] = typeLetter[abs(type[i])];
str[i++]='.'; str[i++]='.'; str[i++]='.';
for (int j=ncol-10; j<ncol; j++) str[i++] = typeLetter[type[j]];
for (int j=ncol-10; j<ncol; j++) str[i++] = typeLetter[abs(type[j])];
}
str[i] = '\0';
return str;
Expand Down Expand Up @@ -495,7 +495,7 @@ static void Field(FieldParseContext *ctx)
if ((*ch==' ' && stripWhite) || (*ch=='\0' && ch<eof))
while(*++ch==' ' || (*ch=='\0' && ch<eof)); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
const char *fieldStart=ch;
if (*ch!=quote || quoteRule==3) {
if (*ch!=quote || quoteRule==3 || quote=='\0') {
// Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
while(!end_of_field(ch)) ch++; // sep, \r, \n or eof will end
*(ctx->ch) = ch;
Expand Down Expand Up @@ -562,7 +562,7 @@ static void Field(FieldParseContext *ctx)
}
target->len = (int32_t)(ch - fieldStart);
target->off = (int32_t)(fieldStart - ctx->anchor);
if (*ch==quote) {
if (*ch==quote) { // quote=='\0' (user set quote="") would have returned earlier above in the same branch as quoteRule 3
ch++;
skip_white(&ch);
*(ctx->ch) = ch;
Expand Down Expand Up @@ -798,7 +798,7 @@ static void parse_double_extended(FieldParseContext *ctx)
double *target = (double*) ctx->targets[sizeof(double)];
bool neg, quoted;
init();
ch += (quoted = (*ch=='"'));
ch += (quoted = (*ch==quote && quote));
ch += (neg = (*ch=='-')) + (*ch=='+');

if (ch[0]=='n' && ch[1]=='a' && ch[2]=='n' && (ch += 3)) goto return_nan;
Expand Down Expand Up @@ -843,7 +843,7 @@ static void parse_double_extended(FieldParseContext *ctx)
return_na:
*target = NA_FLOAT64;
ok:
if (quoted && *ch!='"') {
if (quoted && *ch!=quote) {
*target = NA_FLOAT64;
} else {
*(ctx->ch) = ch + quoted;
Expand Down Expand Up @@ -1048,7 +1048,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
// thread at headPos which has full lineage to sof may bump the quoteRule.
break; // caller will detect this line hasn't finished properly
}
if (*ch==quote) {
if (*ch==quote && quote) { // && quote to exclude quote='\0' (user passed quote="")
ch++;
fun[tmpType[field]](&fctx);
if (*ch==quote) {
Expand Down Expand Up @@ -1169,6 +1169,8 @@ int freadMain(freadMainArgs _args) {
if (dec=='\0') STOP("dec='' not allowed. Should be '.' or ','");
if (args.sep == dec) STOP("sep == dec ('%c') is not allowed", dec);
if (quote == dec) STOP("quote == dec ('%c') is not allowed", dec);
// since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise
// the ending \0 at eof could be treated as a quote (test xxx)

// File parsing context: pointer to the start of file, and to the end of
// the file. The `sof` pointer may be shifted in order to skip over
Expand Down Expand Up @@ -1459,7 +1461,7 @@ int freadMain(freadMainArgs _args) {
int topSkip=0; // how many rows to auto-skip
const char *topStart=NULL;

for (quoteRule=0; quoteRule<4; quoteRule++) {
for (quoteRule=quote?0:3; quoteRule<4; quoteRule++) {
// quote rule in order of preference.
// when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc
for (int s=0; s<nseps; s++) {
Expand All @@ -1469,7 +1471,7 @@ int freadMain(freadMainArgs _args) {
// if (verbose) DTPRINT(" Trying sep='%c' with quoteRule %d ...\n", sep, quoteRule);

if (fill) {
if (quoteRule>1) continue; // turn off self-healing quote rules when filling
if (quoteRule>1 && quote) continue; // turn off self-healing quote rule when filling
int firstRowNcol = countfields(&ch);
int thisncol=0, maxncol=firstRowNcol, thisRow=0;
while (ch<eof && ++thisRow<jumpLines) { // TODO: rename 'jumpLines' to 'jumpRows'
Expand Down Expand Up @@ -1562,7 +1564,7 @@ int freadMain(freadMainArgs _args) {
}

quoteRule = topQuoteRule;
if (quoteRule>1) {
if (quoteRule>1 && quote) {
DTWARN("Found and resolved improper quoting in first %d rows. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning.", jumpLines);
// TODO: include line number and text in warning. Could loop again with the standard quote rule to find the line that fails.
}
Expand Down Expand Up @@ -2172,10 +2174,10 @@ int freadMain(freadMainArgs _args) {
tch = end_NA_string(tch);
skip_white(&tch);
if (!end_of_field(tch)) tch = afterSpace; // else it is the field_end, we're on closing sep|eol and we'll let processor write appropriate NA as if field was empty
if (*tch==quote) { quoted=true; tch++; }
if (*tch==quote && quote) { quoted=true; tch++; }
} // else Field() handles NA inside it unlike other processors e.g. ,, is interpretted as "" or NA depending on option read inside Field()
fun[abs(thisType)](&fctx);
if (quoted) {
if (quoted) { // quoted was only set to true with '&& quote' above (=> quote!='\0' now)
if (*tch==quote) tch++;
else goto typebump;
}
Expand Down Expand Up @@ -2362,6 +2364,7 @@ int freadMain(freadMainArgs _args) {
for (int i=0; i<ncol; i++) typeCounts[ abs(type[i]) ]++;

if (nTypeBump) {
if (verbose) DTPRINT(" %d out-of-sample type bumps: %s\n", nTypeBump, typesAsString(ncol));
rowSize1 = rowSize4 = rowSize8 = 0;
nStringCols = 0;
nNonStringCols = 0;
Expand Down

0 comments on commit b600b75

Please sign in to comment.