Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fread blank.lines.skip gains new value "none" #6158

Draft
wants to merge 10 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
}
stopifnot(
isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0L, isTRUEorFALSE(showProgress),
isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip) || (blank.lines.skip == "none"), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0L, isTRUEorFALSE(showProgress),
isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml),
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0),
is.numeric(nrows), length(nrows)==1L
Expand Down
12 changes: 12 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -18612,3 +18612,15 @@ test(2263.3, options=list(datatable.verbose=TRUE, datatable.optimize=0L), names(
test(2263.4, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce optimized j to")
test(2263.5, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce optimized j to")
test(2263.6, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce optimized j to")

# Tests 2264.*: fread() called with blank.lines.skip="none".
# The same temp file path is passed to every writeLines()/fread() pair below;
# each writeLines() call supplies the input for the test that follows it.
file = tempfile()
# 2264.1: two empty lines followed by "A"; the expected result keeps the two
# leading empty lines as "" rows in a single column V1.
writeLines(c("", "", "A"), file)
test(2264.1, fread(file, blank.lines.skip="none"), data.table(V1 = c("", "", "A")))
# 2264.2: empty lines both before and between the non-empty lines; the expected
# result has five rows, with "" for every empty line.
writeLines(c("", "", "A", "", "B"), file)
test(2264.2, fread(file, blank.lines.skip="none"), data.table(V1 = c("", "", "A", "", "B")))
# 2264.3: the third element passed to writeLines() is NA rather than text; the
# expected result is a three-row V1 column that is entirely NA.
writeLines(c("", "", NA), file)
test(2264.3, fread(file, blank.lines.skip="none"), data.table(V1 = c(NA, NA, NA)))
# 2264.4: first line "a,b" followed by two lines consisting of a single comma;
# the expected result has columns a and b with two rows, all NA.
writeLines(c("a,b", ",", ","), file)
test(2264.4, fread(file, blank.lines.skip="none"), data.table(a = c(NA, NA), b = c(NA, NA)))
# 2264.5: three lines each consisting of a single comma; the expected result
# has default-named columns V1 and V2 with three rows, all NA.
writeLines(c(",", ",", ","), file)
test(2264.5, fread(file, blank.lines.skip="none"), data.table(V1 = c(NA, NA, NA), V2 = c(NA, NA, NA)))
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
SEXP chmatch_R(SEXP, SEXP, SEXP);
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
SEXP chin_R(SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP);
SEXP setlistelt(SEXP, SEXP, SEXP);
Expand Down
25 changes: 15 additions & 10 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ static bool any_number_like_NAstrings=false;
static bool blank_is_a_NAstring=false;
static bool stripWhite=true; // only applies to character columns; numeric fields always stripped
static bool skipEmptyLines=false;
static bool keepLeadingWhite=false;
static int fill=0;
static int *dropFill = NULL;

Expand Down Expand Up @@ -174,6 +175,7 @@ bool freadCleanup(void)
blank_is_a_NAstring = false;
stripWhite = true;
skipEmptyLines = false;
keepLeadingWhite = false;
eol_one_r = false;
fill = 0;
// following are borrowed references: do not free
Expand Down Expand Up @@ -1357,6 +1359,7 @@ int freadMain(freadMainArgs _args) {

stripWhite = args.stripWhite;
skipEmptyLines = args.skipEmptyLines;
keepLeadingWhite = args.keepLeadingWhite;
fill = args.fill;
dec = args.dec;
quote = args.quote;
Expand Down Expand Up @@ -1597,14 +1600,16 @@ int freadMain(freadMainArgs _args) {

// skip blank input at the start
const char *lineStart = ch;
while (ch<eof && (isspace(*ch) || *ch=='\0')) { // isspace matches ' ', \t, \n and \r; \0 before eof should be skipped too
if (*ch=='\n') { ch++; lineStart=ch; row1line++; } else ch++;
}
if (ch>=eof) STOP(_("Input is either empty, fully whitespace, or skip has been set after the last non-whitespace."));
if (verbose) {
if (lineStart>ch) DTPRINT(_(" Moved forward to first non-blank line (%d)\n"), row1line);
DTPRINT(_(" Positioned on line %d starting: <<%s>>\n"), row1line, strlim(lineStart, 30));
}
if (!keepLeadingWhite) {
while (ch<eof && (isspace(*ch) || *ch=='\0')) { // isspace matches ' ', \t, \n and \r; \0 before eof should be skipped too
if (*ch=='\n') { ch++; lineStart=ch; row1line++; } else ch++;
}
if (ch>=eof) STOP(_("Input is either empty, fully whitespace, or skip has been set after the last non-whitespace."));
if (verbose) {
if (lineStart>ch) DTPRINT(_(" Moved forward to first non-blank line (%d)\n"), row1line);
DTPRINT(_(" Positioned on line %d starting: <<%s>>\n"), row1line, strlim(lineStart, 30));
}
} else if (verbose) DTPRINT(_("Kept leading white space"));
ch = pos = lineStart;
}

Expand Down Expand Up @@ -1693,7 +1698,7 @@ int freadMain(freadMainArgs _args) {
const char *prevLineStart=ch, *lineStart=ch;
int lastncol = countfields(&ch);
if (lastncol<0) continue; // invalid file with this sep and quote rule, skip
ASSERT(lastncol>0, "first non-empty row should always be at least one field; %c %d", sep, quoteRule); // # nocov
if (!keepLeadingWhite) ASSERT(lastncol>0, "first non-empty row should always be at least one field; %c %d", sep, quoteRule); // # nocov
const char *thisBlockStart=lineStart;
const char *thisBlockPrevStart = NULL;
int thisBlockLines=1, thisRow=0;
Expand Down Expand Up @@ -1782,7 +1787,7 @@ int freadMain(freadMainArgs _args) {
if (ncol<1 || row1line<1) STOP(_("Internal error: ncol==%d line==%d after detecting sep, ncol and first line"), ncol, row1line); // # nocov
int tt = countfields(&ch);
ch = pos; // move back to start of line since countfields() moved to next
if (!fill && tt!=ncol) STOP(_("Internal error: first line has field count %d but expecting %d"), tt, ncol); // # nocov
if (!fill && tt!=ncol && !keepLeadingWhite) STOP(_("Internal error: first line has field count %d but expecting %d"), tt, ncol); // # nocov
if (verbose) {
DTPRINT(_(" Detected %d columns on line %d. This line is either column names or first data row. Line starts as: <<%s>>\n"),
tt, row1line, strlim(pos, 30));
Expand Down
3 changes: 3 additions & 0 deletions src/fread.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ typedef struct freadMainArgs
// should datetime with no Z or UTZ-offset be read as UTC?
bool noTZasUTC;

// If true, then leading blank lines will be kept.
bool keepLeadingWhite;

char _padding[1];

// Any additional implementation-specific parameters.
Expand Down
9 changes: 7 additions & 2 deletions src/freadR.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ SEXP freadR(
SEXP integer64Arg,
SEXP encodingArg,
SEXP keepLeadingZerosArgs,
SEXP noTZasUTC
SEXP noTZasUTC,
SEXP keepLeadingWhite
) {
verbose = LOGICAL(verboseArg)[0];
warningsAreErrors = LOGICAL(warnings2errorsArg)[0];
Expand Down Expand Up @@ -152,7 +153,11 @@ SEXP freadR(

// here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL
args.stripWhite = LOGICAL(stripWhiteArg)[0];
args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];
if (!isString(skipEmptyLinesArg)) args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];
else if (strcmp(CHAR(STRING_ELT(skipEmptyLinesArg,0)), "none")==0) {
args.skipEmptyLines = false;
args.keepLeadingWhite = true;
} else STOP(_("skipEmptyLines must be a logical or 'none'"));
args.fill = INTEGER(fillArg)[0];
args.showProgress = LOGICAL(showProgressArg)[0];
if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]);
Expand Down
Loading