Skip to content

Commit

Permalink
Infer CSV delimiter if it hasn't been set explicitly
Browse files Browse the repository at this point in the history
Closes #141
  • Loading branch information
lanthaler authored and PowerKiKi committed Apr 20, 2017
1 parent 0bd3a9c commit 3ee9cc5
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Initial implementation of SUMIFS() function
- Additional codepages
- MemoryDrawing not working in HTML writer [#808](https://github.com/PHPOffice/PHPExcel/issues/808)
- CSV Reader can auto-detect the separator used in file [#141](https://github.com/PHPOffice/PhpSpreadsheet/pull/141)

### Changed

Expand Down
6 changes: 5 additions & 1 deletion docs/topics/reading-and-writing-to-file.md
Original file line number Diff line number Diff line change
Expand Up @@ -435,11 +435,15 @@ $spreadsheet = $reader->load("sample.csv");

#### Setting CSV options

Often, CSV files are not really "comma separated", or use semicolon (;)
Often, CSV files are not really "comma separated", or use semicolon (`;`)
as a separator. You can set some options on
\PhpOffice\PhpSpreadsheet\Reader\Csv before reading a CSV
file.

The separator will be auto-detected, so in most cases it should not be necessary
to specify it. If auto-detection does not fit your use case, the separator can
be set manually.

Note that \PhpOffice\PhpSpreadsheet\Reader\Csv by default assumes that
the loaded CSV file is UTF-8 encoded. If you are reading CSV files that
were created in Microsoft Office Excel the correct input encoding may
Expand Down
10 changes: 6 additions & 4 deletions docs/topics/reading-files.md
Original file line number Diff line number Diff line change
Expand Up @@ -523,15 +523,17 @@ CSV | YES | HTML | NO

### Pipe or Tab Separated Value Files

The CSV loader defaults to loading a file where comma is used as the
separator, but you can modify this to load tab- or pipe-separated value
files using the `setDelimiter()` method.
The CSV loader will attempt to auto-detect the separator used in the file. If it
cannot auto-detect, it will default to the comma. If this does not fit your
use-case, you can manually specify a separator by using the `setDelimiter()`
method.

``` php
$inputFileType = 'Csv';
$inputFileName = './sampleData/example1.tsv';

/** Create a new Reader of the type defined in $inputFileType **/ $reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
/** Create a new Reader of the type defined in $inputFileType **/
$reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
/** Set the delimiter to a TAB character **/
$reader->setDelimiter("\t");
// $reader->setDelimiter('|');
Expand Down
84 changes: 83 additions & 1 deletion src/PhpSpreadsheet/Reader/Csv.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader
*
* @var string
*/
private $delimiter = ',';
private $delimiter = null;

/**
* Enclosure.
Expand Down Expand Up @@ -152,6 +152,86 @@ protected function checkSeparator()
return $this->skipBOM();
}

/**
 * Infer the separator if it isn't explicitly set in the file or specified by the user.
 *
 * Reads up to 1000 lines from the current position of the file handle, counts
 * how often each candidate delimiter occurs on every line, and selects the
 * candidate whose per-line count deviates least from its median. Candidates
 * whose median count is zero are ignored. When no candidate qualifies (this
 * includes an empty file), the first candidate (comma) is used.
 *
 * Does nothing if a delimiter is already set. Otherwise the result of
 * skipBOM() is returned once the delimiter has been chosen.
 */
protected function inferSeparator()
{
    if ($this->delimiter !== null) {
        return;
    }

    $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
    $counts = array_fill_keys($potentialDelimiters, []);

    // Count how many times each of the potential delimiters appears in each line.
    // The limit is checked before reading so that $numberLines always equals the
    // number of samples actually collected for every delimiter.
    $maxLines = 1000;
    $numberLines = 0;
    while ($numberLines < $maxLines && ($line = fgets($this->fileHandle)) !== false) {
        ++$numberLines;
        foreach ($potentialDelimiters as $delimiter) {
            $counts[$delimiter][] = substr_count($line, $delimiter);
        }
    }

    // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
    $meanSquareDeviations = [];
    if ($numberLines > 0) {
        // Cast to int: floor() returns a float and the value is used as an array index
        $middleIdx = (int) floor(($numberLines - 1) / 2);

        foreach ($potentialDelimiters as $delimiter) {
            $series = $counts[$delimiter];
            sort($series);

            $median = ($numberLines % 2)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // Dividing two integers that divide evenly yields an integer, so a
            // zero median is always the integer 0 here
            if ($median === 0) {
                continue;
            }

            $meanSquareDeviations[$delimiter] = array_reduce(
                $series,
                function ($sum, $value) use ($median) {
                    return $sum + pow($value - $median, 2);
                },
                0
            ) / count($series);
        }
    }

    // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
    $min = INF;
    foreach ($potentialDelimiters as $delimiter) {
        if (!isset($meanSquareDeviations[$delimiter])) {
            continue;
        }

        if ($meanSquareDeviations[$delimiter] < $min) {
            $min = $meanSquareDeviations[$delimiter];
            $this->delimiter = $delimiter;
        }
    }

    // If no delimiter could be detected, fall back to the default
    if ($this->delimiter === null) {
        $this->delimiter = reset($potentialDelimiters);
    }

    return $this->skipBOM();
}

/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
*
Expand All @@ -171,6 +251,7 @@ public function listWorksheetInfo($pFilename)
// Skip BOM, if any
$this->skipBOM();
$this->checkSeparator();
$this->inferSeparator();

$worksheetInfo = [];
$worksheetInfo[0]['worksheetName'] = 'Worksheet';
Expand Down Expand Up @@ -237,6 +318,7 @@ public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
// Skip BOM, if any
$this->skipBOM();
$this->checkSeparator();
$this->inferSeparator();

// Create new PhpSpreadsheet object
while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
Expand Down
14 changes: 14 additions & 0 deletions tests/PhpSpreadsheetTests/Reader/CsvTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,18 @@ public function testEnclosure()
$actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
$this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
}

/**
 * Loading a semicolon-separated file must make the reader report ';' as its
 * delimiter and keep comma-containing values intact.
 */
public function testDelimiterDetection()
{
    $csvReader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();

    // A freshly created reader has no delimiter yet
    $this->assertNull($csvReader->getDelimiter());

    $workbook = $csvReader->load(__DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv');

    $this->assertSame(';', $csvReader->getDelimiter(), 'should be able to infer the delimiter');

    $cellValue = $workbook->getActiveSheet()->getCell('C2')->getValue();
    $this->assertSame('25,5', $cellValue, 'should be able to retrieve values with commas');
}
}
3 changes: 3 additions & 0 deletions tests/data/Reader/CSV/semicolon_separated.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This;Are;Headers
Cell A2;Number with comma;25,5
Two colons and a comma;B|3;:,:

0 comments on commit 3ee9cc5

Please sign in to comment.