From 3ee9cc5ce67d4b4331f2187a5347cb2ee9e3f49d Mon Sep 17 00:00:00 2001
From: Markus Lanthaler
Date: Mon, 17 Apr 2017 18:51:53 +0200
Subject: [PATCH] Infer CSV delimiter if it hasn't been set explicitly

Closes #141
---
 CHANGELOG.md                                  |  1 +
 docs/topics/reading-and-writing-to-file.md    |  6 +-
 docs/topics/reading-files.md                  | 10 ++-
 src/PhpSpreadsheet/Reader/Csv.php             | 84 ++++++++++++++++++-
 tests/PhpSpreadsheetTests/Reader/CsvTest.php  | 14 ++++
 tests/data/Reader/CSV/semicolon_separated.csv |  3 +
 6 files changed, 112 insertions(+), 6 deletions(-)
 create mode 100644 tests/data/Reader/CSV/semicolon_separated.csv

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fbb3a4f3b5..30c2839236 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 - Initial implementation of SUMIFS() function
 - Additional codepages
 - MemoryDrawing not working in HTML writer [#808](https://github.com/PHPOffice/PHPExcel/issues/808)
+- CSV Reader can auto-detect the separator used in the file [#141](https://github.com/PHPOffice/PhpSpreadsheet/pull/141)
 
 ### Changed
 
diff --git a/docs/topics/reading-and-writing-to-file.md b/docs/topics/reading-and-writing-to-file.md
index 70bd7e435f..5bc05b749d 100644
--- a/docs/topics/reading-and-writing-to-file.md
+++ b/docs/topics/reading-and-writing-to-file.md
@@ -435,11 +435,15 @@ $spreadsheet = $reader->load("sample.csv");
 
 #### Setting CSV options
 
-Often, CSV files are not really "comma separated", or use semicolon (;)
+Often, CSV files are not really "comma separated", or use semicolon (`;`)
 as a separator. You can instruct
 \PhpOffice\PhpSpreadsheet\Reader\Csv some options before reading a CSV
 file.
 
+The separator will be auto-detected, so in most cases it should not be necessary
+to specify it. If auto-detection does not fit the use-case, the separator can be
+set manually.
+
 Note that \PhpOffice\PhpSpreadsheet\Reader\Csv by default assumes that
 the loaded CSV file is UTF-8 encoded. If you are reading CSV files that
 were created in Microsoft Office Excel the correct input encoding may
diff --git a/docs/topics/reading-files.md b/docs/topics/reading-files.md
index 7f44bb285b..afd3db35ff 100644
--- a/docs/topics/reading-files.md
+++ b/docs/topics/reading-files.md
@@ -523,15 +523,17 @@ CSV | YES | HTML | NO
 
 ### Pipe or Tab Separated Value Files
 
-The CSV loader defaults to loading a file where comma is used as the
-separator, but you can modify this to load tab- or pipe-separated value
-files using the `setDelimiter()` method.
+The CSV loader will attempt to auto-detect the separator used in the file. If it
+cannot auto-detect, it will default to the comma. If this does not fit your
+use-case, you can manually specify a separator by using the `setDelimiter()`
+method.
 
 ``` php
 $inputFileType = 'Csv';
 $inputFileName = './sampleData/example1.tsv';
 
-/** Create a new Reader of the type defined in $inputFileType **/ $reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
+/** Create a new Reader of the type defined in $inputFileType **/
+$reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
 /** Set the delimiter to a TAB character **/
 $reader->setDelimiter("\t");
 // $reader->setDelimiter('|');
diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php
index 29876f7785..c0d064b72e 100644
--- a/src/PhpSpreadsheet/Reader/Csv.php
+++ b/src/PhpSpreadsheet/Reader/Csv.php
@@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader
      *
      * @var string
      */
-    private $delimiter = ',';
+    private $delimiter = null;
 
     /**
      * Enclosure.
@@ -152,6 +152,86 @@ protected function checkSeparator()
         return $this->skipBOM();
     }
 
+    /**
+     * Infer the separator if it isn't explicitly set in the file or specified by the user.
+     *
+     * At most 1000 lines are sampled; an empty file falls back to the comma.
+     */
+    protected function inferSeparator()
+    {
+        if ($this->delimiter !== null) {
+            return;
+        }
+
+        $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
+        $counts = [];
+        foreach ($potentialDelimiters as $delimiter) {
+            $counts[$delimiter] = [];
+        }
+
+        // Count how many times each of the potential delimiters appears in each line.
+        // The limit is tested before reading so $numberLines always equals the lines counted.
+        $numberLines = 0;
+        while ($numberLines < 1000 && ($line = fgets($this->fileHandle)) !== false) {
+            ++$numberLines;
+            foreach ($potentialDelimiters as $delimiter) {
+                $counts[$delimiter][] = substr_count($line, $delimiter);
+            }
+        }
+
+        // Nothing could be read (e.g. an empty file): there is no median to compute, use the default
+        if ($numberLines === 0) {
+            $this->delimiter = reset($potentialDelimiters);
+
+            return $this->skipBOM();
+        }
+
+        // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
+        $meanSquareDeviations = [];
+        $middleIdx = (int) floor(($numberLines - 1) / 2);
+
+        foreach ($potentialDelimiters as $delimiter) {
+            $series = $counts[$delimiter];
+            sort($series);
+
+            $median = ($numberLines % 2)
+                ? $series[$middleIdx]
+                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
+
+            if ($median === 0) {
+                continue;
+            }
+
+            $meanSquareDeviations[$delimiter] = array_reduce(
+                $series,
+                function ($sum, $value) use ($median) {
+                    return $sum + pow($value - $median, 2);
+                },
+                0
+            ) / count($series);
+        }
+
+        // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
+        $min = INF;
+        foreach ($potentialDelimiters as $delimiter) {
+            if (!isset($meanSquareDeviations[$delimiter])) {
+                continue;
+            }
+
+            if ($meanSquareDeviations[$delimiter] < $min) {
+                $min = $meanSquareDeviations[$delimiter];
+                $this->delimiter = $delimiter;
+            }
+        }
+
+        // If no delimiter could be detected, fall back to the default
+        if ($this->delimiter === null) {
+            $this->delimiter = reset($potentialDelimiters);
+        }
+
+        return $this->skipBOM();
+    }
+
     /**
      * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
      *
@@ -171,6 +251,7 @@ public function listWorksheetInfo($pFilename)
         // Skip BOM, if any
         $this->skipBOM();
         $this->checkSeparator();
+        $this->inferSeparator();
 
         $worksheetInfo = [];
         $worksheetInfo[0]['worksheetName'] = 'Worksheet';
@@ -237,6 +318,7 @@ public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
         // Skip BOM, if any
         $this->skipBOM();
         $this->checkSeparator();
+        $this->inferSeparator();
 
         // Create new PhpSpreadsheet object
         while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
diff --git a/tests/PhpSpreadsheetTests/Reader/CsvTest.php b/tests/PhpSpreadsheetTests/Reader/CsvTest.php
index ee092eb25a..6727b10b98 100644
--- a/tests/PhpSpreadsheetTests/Reader/CsvTest.php
+++ b/tests/PhpSpreadsheetTests/Reader/CsvTest.php
@@ -24,4 +24,18 @@ public function testEnclosure()
         $actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
         $this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
     }
+
+    public function testDelimiterDetection()
+    {
+        $reader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();
+        $this->assertNull($reader->getDelimiter());
+
+        $filename = __DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv';
+        $spreadsheet = $reader->load($filename);
+
+        $this->assertSame(';', $reader->getDelimiter(), 'should be able to infer the delimiter');
+
+        $actual = $spreadsheet->getActiveSheet()->getCell('C2')->getValue();
+        $this->assertSame('25,5', $actual, 'should be able to retrieve values with commas');
+    }
 }
diff --git a/tests/data/Reader/CSV/semicolon_separated.csv b/tests/data/Reader/CSV/semicolon_separated.csv
new file mode 100644
index 0000000000..811d8153c2
--- /dev/null
+++ b/tests/data/Reader/CSV/semicolon_separated.csv
@@ -0,0 +1,3 @@
+This;Are;Headers
+Cell A2;Number with comma;25,5
+Two colons and a comma;B|3;:,: