From 84d17b552b1008f4d8c010e17f37cc67d6b0e9af Mon Sep 17 00:00:00 2001 From: Pasha Stetsenko Date: Thu, 6 Jul 2017 18:36:14 -0700 Subject: [PATCH] Strip final 0x1A character in the input file, if found. Closes #1612 --- inst/tests/tests.Rraw | 2 ++ src/fread.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3ec68e9b9f..d4bb69b021 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9957,6 +9957,8 @@ test(1769, fread("issue_2157_sampling_reached_eof_early.txt")[,c("X1","X2","X10" test(1800, fread("A\n6e55693457e549ecfce0\n"), data.table(A=c("6e55693457e549ecfce0"))) test(1800.1, fread("A\n1e55555555\n-1e+234056\n2e-59745"), data.table(A=c(Inf, -Inf, 0))) +test(1816, fread("A,E\n1,2\n5,7\n4,6\n\x1A"), data.table(A=c(1L, 5L, 4L), E=c(2L, 7L, 6L))) # see #1612 + ########################## # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time. diff --git a/src/fread.c b/src/fread.c index f97e679947..889a5962db 100644 --- a/src/fread.c +++ b/src/fread.c @@ -792,8 +792,10 @@ int freadMain(freadMainArgs _args) { //********************************************************************************************* // [3] Check whether the file contains BOM (Byte Order Mark), and if yes - // strip it, modifying `sof`. Also, presence of BOM allows us to - // sometimes detect file's encoding. + // strip it, modifying `sof`. If the last byte in file is 0x1A (Ctrl+Z) + // then skip it too, modifying `eof`. + // + // Also, presence of BOM allows us to sometimes detect file's encoding. // See: https://en.wikipedia.org/wiki/Byte_order_mark // See: issues #1087 and #1465 //********************************************************************************************* @@ -801,17 +803,21 @@ int freadMain(freadMainArgs _args) { if (fileSize >= 3 && memcmp(sof, "\xEF\xBB\xBF", 3) == 0) { sof += 3; // ienc = CE_UTF8; - if (args.verbose) DTPRINT(" UTF-8 byte order mark EF BB BF found at the start of the file and skipped.\n"); + if (verbose) DTPRINT(" UTF-8 byte order mark EF BB BF found at the start of the file and skipped.\n"); } else if (fileSize >= 4 && memcmp(sof, "\x84\x31\x95\x33", 4) == 0) { sof += 4; // ienc = CE_GB18030; - if (args.verbose) DTPRINT(" GB-18030 byte order mark 84 31 95 33 found at the start of the file and skipped.\n"); + if (verbose) DTPRINT(" GB-18030 byte order mark 84 31 95 33 found at the start of the file and skipped.\n"); DTWARN("GB-18030 encoding detected, however fread() is unable to decode it. Some character fields may be garbled.\n"); } else if (fileSize >= 2 && sof[0] + sof[1] == '\xFE' + '\xFF') { // either 0xFE 0xFF or 0xFF 0xFE STOP("File is encoded in UTF-16, this encoding is not supported by fread(). Please recode the file to UTF-8."); } + if (eof[-1] == '\x1A') { + eof--; + if (verbose) DTPRINT(" Last byte in file found to be 0x1A (Ctrl+Z) and skipped.\n"); + } //*********************************************************************************************