diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php
index 29876f7785..c0d064b72e 100644
--- a/src/PhpSpreadsheet/Reader/Csv.php
+++ b/src/PhpSpreadsheet/Reader/Csv.php
@@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader
      *
-     * @var string
+     * @var null|string
      */
-    private $delimiter = ',';
+    private $delimiter = null;
 
     /**
      * Enclosure.
@@ -152,6 +152,90 @@ protected function checkSeparator()
         return $this->skipBOM();
     }
 
+    /**
+     * Infer the separator if it isn't explicitly set in the file or specified by the user.
+     *
+     * Samples up to 1000 lines from the current position of the file handle, counts how often
+     * each candidate delimiter occurs per line, and selects the candidate whose per-line count
+     * deviates least from its (non-zero) median. Falls back to the first candidate (a comma)
+     * when the sample is empty or no candidate qualifies.
+     */
+    protected function inferSeparator()
+    {
+        // A delimiter set by the user or by a "sep=" line always wins
+        if ($this->delimiter !== null) {
+            return;
+        }
+
+        $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
+        $counts = [];
+        foreach ($potentialDelimiters as $delimiter) {
+            $counts[$delimiter] = [];
+        }
+
+        // Count how many times each of the potential delimiters appears in each line.
+        // The cap is tested before reading, so $numberLines always matches the number of sampled lines.
+        $numberLines = 0;
+        while ($numberLines < 1000 && ($line = fgets($this->fileHandle)) !== false) {
+            ++$numberLines;
+            foreach ($potentialDelimiters as $delimiter) {
+                $counts[$delimiter][] = substr_count($line, $delimiter);
+            }
+        }
+
+        // Nothing to infer from an empty sample: fall back to the default
+        if ($numberLines === 0) {
+            $this->delimiter = reset($potentialDelimiters);
+
+            return $this->skipBOM();
+        }
+
+        // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
+        $meanSquareDeviations = [];
+        $middleIdx = (int) floor(($numberLines - 1) / 2);
+
+        foreach ($potentialDelimiters as $delimiter) {
+            $series = $counts[$delimiter];
+            sort($series);
+
+            $median = ($numberLines % 2)
+                ? $series[$middleIdx]
+                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
+
+            if ($median === 0) {
+                continue;
+            }
+
+            $meanSquareDeviations[$delimiter] = array_reduce(
+                $series,
+                function ($sum, $value) use ($median) {
+                    return $sum + pow($value - $median, 2);
+                },
+                0
+            ) / count($series);
+        }
+
+        // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
+        $min = INF;
+        foreach ($potentialDelimiters as $delimiter) {
+            if (!isset($meanSquareDeviations[$delimiter])) {
+                continue;
+            }
+
+            if ($meanSquareDeviations[$delimiter] < $min) {
+                $min = $meanSquareDeviations[$delimiter];
+                $this->delimiter = $delimiter;
+            }
+        }
+
+        // If no delimiter could be detected, fall back to the default
+        if ($this->delimiter === null) {
+            $this->delimiter = reset($potentialDelimiters);
+        }
+
+        return $this->skipBOM();
+    }
+
     /**
      * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
      *
@@ -171,6 +255,7 @@ public function listWorksheetInfo($pFilename)
         // Skip BOM, if any
         $this->skipBOM();
         $this->checkSeparator();
+        $this->inferSeparator();
 
         $worksheetInfo = [];
         $worksheetInfo[0]['worksheetName'] = 'Worksheet';
@@ -237,6 +322,7 @@ public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
         // Skip BOM, if any
         $this->skipBOM();
         $this->checkSeparator();
+        $this->inferSeparator();
 
         // Create new PhpSpreadsheet object
         while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
diff --git a/tests/PhpSpreadsheetTests/Reader/CsvTest.php b/tests/PhpSpreadsheetTests/Reader/CsvTest.php
index ee092eb25a..6727b10b98 100644
--- a/tests/PhpSpreadsheetTests/Reader/CsvTest.php
+++ b/tests/PhpSpreadsheetTests/Reader/CsvTest.php
@@ -24,4 +24,18 @@ public function testEnclosure()
         $actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
         $this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
     }
+
+    public function testDelimiterDetection()
+    {
+        $reader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();
+        $this->assertNull($reader->getDelimiter());
+
+        $filename = __DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv';
+        $spreadsheet = $reader->load($filename);
+
+        $this->assertSame(';', $reader->getDelimiter(), 'should be able to infer the delimiter');
+
+        $actual = $spreadsheet->getActiveSheet()->getCell('C2')->getValue();
+        $this->assertSame('25,5', $actual, 'should be able to retrieve values with commas');
+    }
 }
diff --git a/tests/data/Reader/CSV/semicolon_separated.csv b/tests/data/Reader/CSV/semicolon_separated.csv
new file mode 100644
index 0000000000..811d8153c2
--- /dev/null
+++ b/tests/data/Reader/CSV/semicolon_separated.csv
@@ -0,0 +1,3 @@
+This;Are;Headers
+Cell A2;Number with comma;25,5
+Two colons and a comma;B|3;:,: