From 3f81c543ab6c597d9bcb58735663cb20c7286486 Mon Sep 17 00:00:00 2001 From: Markus Lanthaler Date: Mon, 17 Apr 2017 18:51:53 +0200 Subject: [PATCH 1/4] Infer CSV delimiter if it hasn't been set explicitly --- src/PhpSpreadsheet/Reader/Csv.php | 80 ++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php index 29876f7785..fb1b7f64e5 100644 --- a/src/PhpSpreadsheet/Reader/Csv.php +++ b/src/PhpSpreadsheet/Reader/Csv.php @@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader * * @var string */ - private $delimiter = ','; + private $delimiter = null; /** * Enclosure. @@ -152,6 +152,82 @@ protected function checkSeparator() return $this->skipBOM(); } + /** + * Infer the separator if it isn't explicitly set in the file or specified by the user + */ + protected function inferSeparator() + { + if ($this->delimiter !== null) { + return; + } + + $potentialDelimiters = [ ',', ';', "\t", '|', ':', ' ' ]; + $count = array(); + foreach ($potentialDelimiters as $delimiter) { + $counts[$delimiter] = array(); + } + + // Count how many times each of the potential delimiters appears in each line + $numberLines = 0; + while (($line = fgets($this->fileHandle)) !== false && (++$numberLines < 1000)) { + $countLine = array(); + for ($i = strlen($line) - 1; $i >= 0; $i--) { + $char = $line[$i]; + if (isset($counts[$char])) { + if (!isset($countLine[$char])) { + $countLine[$char] = 0; + } + $countLine[$char]++; + } + } + foreach ($potentialDelimiters as $delimiter) { + $counts[$delimiter][] = isset($countLine[$delimiter]) + ? $countLine[$delimiter] + : 0; + } + } + + // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently) + $meanSquareDeviations = array(); + $middleIdx = floor(($numberLines - 1) / 2); + + foreach ($potentialDelimiters as $delimiter) { + $series = $counts[$delimiter]; + sort($series); + + $median = ($numberLines % 2) + ? $series[$middleIdx] + : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2; + + if ($median === 0) { + continue; + } + + $meanSquareDeviations[$delimiter] = array_reduce($series, function ($sum, $value) use($median) { return $sum + pow($value - $median, 2); }) + / count($series); + } + + // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected) + $min = INF; + foreach ($potentialDelimiters as $delimiter) { + if (!isset($meanSquareDeviations[$delimiter])) { + continue; + } + + if ($meanSquareDeviations[$delimiter] < $min) { + $min = $meanSquareDeviations[$delimiter]; + $this->delimiter = $delimiter; + } + } + + // If no delimiter could be detected, fall back to the default + if ($this->delimiter === null) { + $this->delimiter = reset($potentialDelimiters); + } + + return $this->skipBOM(); + } + /** * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns). * @@ -171,6 +247,7 @@ public function listWorksheetInfo($pFilename) // Skip BOM, if any $this->skipBOM(); $this->checkSeparator(); + $this->inferSeparator(); $worksheetInfo = []; $worksheetInfo[0]['worksheetName'] = 'Worksheet'; @@ -237,6 +314,7 @@ public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet) // Skip BOM, if any $this->skipBOM(); $this->checkSeparator(); + $this->inferSeparator(); // Create new PhpSpreadsheet object while ($spreadsheet->getSheetCount() <= $this->sheetIndex) { From f668ec7fb8f9e47017228e7dd071d5dea7d30097 Mon Sep 17 00:00:00 2001 From: Markus Lanthaler Date: Mon, 17 Apr 2017 19:25:05 +0200 Subject: [PATCH 2/4] Fix CS --- src/PhpSpreadsheet/Reader/Csv.php | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php index fb1b7f64e5..a088d1e564 100644 --- a/src/PhpSpreadsheet/Reader/Csv.php +++ b/src/PhpSpreadsheet/Reader/Csv.php @@ -153,7 +153,7 @@ protected function checkSeparator() } /** - * Infer the separator if it isn't explicitly set in the file or specified by the user + * Infer the separator if it isn't explicitly set in the file or specified by the user. */ protected function inferSeparator() { @@ -161,23 +161,23 @@ protected function inferSeparator() return; } - $potentialDelimiters = [ ',', ';', "\t", '|', ':', ' ' ]; - $count = array(); + $potentialDelimiters = [',', ';', "\t", '|', ':', ' ']; + $count = []; foreach ($potentialDelimiters as $delimiter) { - $counts[$delimiter] = array(); + $counts[$delimiter] = []; } // Count how many times each of the potential delimiters appears in each line $numberLines = 0; while (($line = fgets($this->fileHandle)) !== false && (++$numberLines < 1000)) { - $countLine = array(); - for ($i = strlen($line) - 1; $i >= 0; $i--) { + $countLine = []; + for ($i = strlen($line) - 1; $i >= 0; --$i) { $char = $line[$i]; if (isset($counts[$char])) { if (!isset($countLine[$char])) { $countLine[$char] = 0; } - $countLine[$char]++; + ++$countLine[$char]; } } foreach ($potentialDelimiters as $delimiter) { @@ -188,7 +188,7 @@ protected function inferSeparator() } // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently) - $meanSquareDeviations = array(); + $meanSquareDeviations = []; $middleIdx = floor(($numberLines - 1) / 2); foreach ($potentialDelimiters as $delimiter) { @@ -203,8 +203,12 @@ protected function inferSeparator() continue; } - $meanSquareDeviations[$delimiter] = array_reduce($series, function ($sum, $value) use($median) { return $sum + pow($value - $median, 2); }) - / count($series); + $meanSquareDeviations[$delimiter] = array_reduce( + $series, + function ($sum, $value) use ($median) { + return $sum + pow($value - $median, 2); + } + ) / count($series); } // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected) From 3b163dd93ba4ce9959b4347b27111f1be4f8ebe5 Mon Sep 17 00:00:00 2001 From: Markus Lanthaler Date: Wed, 19 Apr 2017 08:20:42 +0200 Subject: [PATCH 3/4] Test delimiter detection --- tests/PhpSpreadsheetTests/Reader/CsvTest.php | 14 ++++++++++++++ tests/data/Reader/CSV/semicolon_separated.csv | 3 +++ 2 files changed, 17 insertions(+) create mode 100644 tests/data/Reader/CSV/semicolon_separated.csv diff --git a/tests/PhpSpreadsheetTests/Reader/CsvTest.php b/tests/PhpSpreadsheetTests/Reader/CsvTest.php index ee092eb25a..6727b10b98 100644 --- a/tests/PhpSpreadsheetTests/Reader/CsvTest.php +++ b/tests/PhpSpreadsheetTests/Reader/CsvTest.php @@ -24,4 +24,18 @@ public function testEnclosure() $actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue(); $this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes'); } + + public function testDelimiterDetection() + { + $reader = new \PhpOffice\PhpSpreadsheet\Reader\Csv(); + $this->assertNull($reader->getDelimiter()); + + $filename = __DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv'; + $spreadsheet = $reader->load($filename); + + $this->assertSame(';', $reader->getDelimiter(), 'should be able to infer the delimiter'); + + $actual = $spreadsheet->getActiveSheet()->getCell('C2')->getValue(); + $this->assertSame('25,5', $actual, 'should be able to retrieve values with commas'); + } } diff --git a/tests/data/Reader/CSV/semicolon_separated.csv b/tests/data/Reader/CSV/semicolon_separated.csv new file mode 100644 index 0000000000..811d8153c2 --- /dev/null +++ b/tests/data/Reader/CSV/semicolon_separated.csv @@ -0,0 +1,3 @@ +This;Are;Headers +Cell A2;Number with comma;25,5 +Two colons and a comma;B|3;:,: From e68f6b63f054999c939380a15577550f061af631 Mon Sep 17 00:00:00 2001 From: Markus Lanthaler Date: Wed, 19 Apr 2017 09:00:48 +0200 Subject: [PATCH 4/4] Fix typo --- src/PhpSpreadsheet/Reader/Csv.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php index a088d1e564..c0d064b72e 100644 --- a/src/PhpSpreadsheet/Reader/Csv.php +++ b/src/PhpSpreadsheet/Reader/Csv.php @@ -162,7 +162,7 @@ protected function inferSeparator() } $potentialDelimiters = [',', ';', "\t", '|', ':', ' ']; - $count = []; + $counts = []; foreach ($potentialDelimiters as $delimiter) { $counts[$delimiter] = []; }