Skip to content

Commit

Permalink
Strip final 0x1A character in the input file, if found. Closes #1612
Browse files Browse the repository at this point in the history
  • Loading branch information
st-pasha committed Jul 7, 2017
1 parent 3f29b60 commit 84d17b5
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 4 deletions.
2 changes: 2 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9957,6 +9957,8 @@ test(1769, fread("issue_2157_sampling_reached_eof_early.txt")[,c("X1","X2","X10"
test(1800, fread("A\n6e55693457e549ecfce0\n"), data.table(A=c("6e55693457e549ecfce0")))
test(1800.1, fread("A\n1e55555555\n-1e+234056\n2e-59745"), data.table(A=c(Inf, -Inf, 0)))

test(1816, fread("A,E\n1,2\n5,7\n4,6\n\x1A"), data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L))) # see #1612

##########################

# TODO: Tests involving GForce functions need to be run with optimisation level 1 and 2, so that both functions are tested all the time.
Expand Down
14 changes: 10 additions & 4 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -792,26 +792,32 @@ int freadMain(freadMainArgs _args) {

//*********************************************************************************************
// [3] Check whether the file contains BOM (Byte Order Mark), and if yes
// strip it, modifying `sof`. Also, presence of BOM allows us to
// sometimes detect file's encoding.
// strip it, modifying `sof`. If the last byte in file is 0x1A (Ctrl+Z)
// then skip it too, modifying `eof`.
//
// Also, presence of BOM allows us to sometimes detect file's encoding.
// See: https://en.wikipedia.org/wiki/Byte_order_mark
// See: issues #1087 and #1465
//*********************************************************************************************
if (verbose) DTPRINT("[3] Detect and skip BOM\n");
if (fileSize >= 3 && memcmp(sof, "\xEF\xBB\xBF", 3) == 0) {
sof += 3;
// ienc = CE_UTF8;
if (args.verbose) DTPRINT(" UTF-8 byte order mark EF BB BF found at the start of the file and skipped.\n");
if (verbose) DTPRINT(" UTF-8 byte order mark EF BB BF found at the start of the file and skipped.\n");
}
else if (fileSize >= 4 && memcmp(sof, "\x84\x31\x95\x33", 4) == 0) {
sof += 4;
// ienc = CE_GB18030;
if (args.verbose) DTPRINT(" GB-18030 byte order mark 84 31 95 33 found at the start of the file and skipped.\n");
if (verbose) DTPRINT(" GB-18030 byte order mark 84 31 95 33 found at the start of the file and skipped.\n");
DTWARN("GB-18030 encoding detected, however fread() is unable to decode it. Some character fields may be garbled.\n");
}
else if (fileSize >= 2 && sof[0] + sof[1] == '\xFE' + '\xFF') { // either 0xFE 0xFF or 0xFF 0xFE
STOP("File is encoded in UTF-16, this encoding is not supported by fread(). Please recode the file to UTF-8.");
}
if (eof[-1] == '\x1A') {
eof--;
if (verbose) DTPRINT(" Last byte in file found to be 0x1A (Ctrl+Z) and skipped.\n");
}


//*********************************************************************************************
Expand Down

0 comments on commit 84d17b5

Please sign in to comment.