-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Properly determine CSV delimiter #17459
Changes from 26 commits
fc8dd28
6fe4f9f
c99d94e
58e98c5
56f0995
7da56ba
5b3a8d9
b0a6290
79ee659
f508559
e0f2862
ab0583c
e466ab7
a54e6ac
62d10e3
0a2f06d
85eda5f
fc02c05
baf37d3
32a9c36
e8ac38a
feabe0e
f808d73
0084114
967da1d
683def1
95ede28
b938db4
772095d
8a54f7a
059ae3d
88a2642
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,16 +7,18 @@ package csv | |
import ( | ||
"bytes" | ||
stdcsv "encoding/csv" | ||
"errors" | ||
"io" | ||
"path/filepath" | ||
"regexp" | ||
"strings" | ||
|
||
"code.gitea.io/gitea/modules/markup" | ||
"code.gitea.io/gitea/modules/translation" | ||
"code.gitea.io/gitea/modules/util" | ||
) | ||
|
||
var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`) | ||
const maxLines = 10 | ||
const guessSampleSize = 1e4 // 10k | ||
|
||
// CreateReader creates a csv.Reader with the given delimiter. | ||
func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader { | ||
|
@@ -30,70 +32,95 @@ func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader { | |
return rd | ||
} | ||
|
||
// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader. | ||
// Reads at most 10k bytes. | ||
func CreateReaderAndGuessDelimiter(rd io.Reader) (*stdcsv.Reader, error) { | ||
var data = make([]byte, 1e4) | ||
// CreateReaderAndDetermineDelimiter tries to guess the field delimiter from the content and creates a csv.Reader. | ||
// Reads at most guessSampleSize bytes. | ||
func CreateReaderAndDetermineDelimiter(ctx *markup.RenderContext, rd io.Reader) (*stdcsv.Reader, error) { | ||
var data = make([]byte, guessSampleSize) | ||
size, err := util.ReadAtMost(rd, data) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return CreateReader( | ||
io.MultiReader(bytes.NewReader(data[:size]), rd), | ||
guessDelimiter(data[:size]), | ||
determineDelimiter(ctx, data[:size]), | ||
), nil | ||
} | ||
|
||
// guessDelimiter scores the input CSV data against delimiters, and returns the best match. | ||
func guessDelimiter(data []byte) rune { | ||
maxLines := 10 | ||
text := quoteRegexp.ReplaceAllLiteralString(string(data), "") | ||
lines := strings.SplitN(text, "\n", maxLines+1) | ||
lines = lines[:util.Min(maxLines, len(lines))] | ||
|
||
delimiters := []rune{',', ';', '\t', '|', '@'} | ||
bestDelim := delimiters[0] | ||
bestScore := 0.0 | ||
for _, delim := range delimiters { | ||
score := scoreDelimiter(lines, delim) | ||
if score > bestScore { | ||
bestScore = score | ||
bestDelim = delim | ||
} | ||
// determineDelimiter takes a RenderContext and if it isn't nil and the Filename has an extension that specifies the delimiter, | ||
// it is used as the delimiter. Otherwise we call guessDelimiter with the data passed | ||
func determineDelimiter(ctx *markup.RenderContext, data []byte) rune { | ||
extension := ".csv" | ||
if ctx != nil { | ||
extension = strings.ToLower(filepath.Ext(ctx.Filename)) | ||
} | ||
|
||
var delimiter rune | ||
switch extension { | ||
case ".tsv": | ||
delimiter = '\t' | ||
case ".psv": | ||
delimiter = '|' | ||
default: | ||
delimiter = guessDelimiter(data) | ||
} | ||
|
||
return bestDelim | ||
return delimiter | ||
} | ||
|
||
// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV. | ||
func scoreDelimiter(lines []string, delim rune) float64 { | ||
countTotal := 0 | ||
countLineMax := 0 | ||
linesNotEqual := 0 | ||
// quoteRegexp follows the RFC-4180 CSV standard for when double-quotes are used to enclose fields, then a double-quote appearing inside a | ||
// field must be escaped by preceding it with another double quote. https://www.ietf.org/rfc/rfc4180.txt | ||
// This finds all quoted strings that have escaped quotes. | ||
var quoteRegexp = regexp.MustCompile(`"[^"]*"`) | ||
|
||
for _, line := range lines { | ||
if len(line) == 0 { | ||
continue | ||
} | ||
// removeQuotedStrings uses the quoteRegexp to remove all quoted strings so that we can realiably have each row on one line | ||
zeripath marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// (quoted strings often have new lines within the string) | ||
func removeQuotedString(text string) string { | ||
return quoteRegexp.ReplaceAllLiteralString(text, "") | ||
} | ||
|
||
countLine := strings.Count(line, string(delim)) | ||
countTotal += countLine | ||
if countLine != countLineMax { | ||
if countLineMax != 0 { | ||
linesNotEqual++ | ||
} | ||
countLineMax = util.Max(countLine, countLineMax) | ||
} | ||
// guessDelimiter takes up to maxLines of the CSV text, iterates through the possible delimiters, and sees if the CSV Reader reads it without throwing any errors. | ||
// If more than one delmiiter passes, the delimiter that results in the most columns is returned. | ||
richmahn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
func guessDelimiter(data []byte) rune { | ||
delimiter := guessFromBeforeAfterQuotes(data) | ||
if delimiter != 0 { | ||
return delimiter | ||
} | ||
|
||
return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines))) | ||
// Removes quoted values so we don't have columns with new lines in them | ||
text := removeQuotedString(string(data)) | ||
zeripath marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// Make the text just be maxLines or less without truncated lines | ||
richmahn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
lines := strings.SplitN(text, "\n", maxLines+1) // Will contain at least one line, and if there are more than MaxLines, the last item holds the rest of the lines | ||
if len(lines) > maxLines { | ||
// If the length of lines is > maxLines we know we have the max number of lines, trim it to maxLines | ||
lines = lines[:maxLines] | ||
} else if len(lines) > 1 && len(data) >= guessSampleSize { | ||
// Even with data >= guessSampleSize, we don't have maxLines + 1 (no extra lines, must have really long lines) | ||
// thus the last line is probably have a truncated line. Drop the last line if len(lines) > 1 | ||
lines = lines[:len(lines)-1] | ||
} | ||
|
||
// Put lines back together as a string | ||
text = strings.Join(lines, "\n") | ||
|
||
delimiters := []rune{',', '\t', ';', '|', '@'} | ||
validDelim := delimiters[0] | ||
validDelimColCount := 0 | ||
for _, delim := range delimiters { | ||
csvReader := stdcsv.NewReader(strings.NewReader(text)) | ||
csvReader.Comma = delim | ||
if rows, err := csvReader.ReadAll(); err == nil && len(rows) > 0 && len(rows[0]) > validDelimColCount { | ||
validDelim = delim | ||
validDelimColCount = len(rows[0]) | ||
} | ||
} | ||
return validDelim | ||
} | ||
|
||
// FormatError converts csv errors into readable messages. | ||
func FormatError(err error, locale translation.Locale) (string, error) { | ||
var perr *stdcsv.ParseError | ||
if errors.As(err, &perr) { | ||
if perr, ok := err.(*stdcsv.ParseError); ok { | ||
zeripath marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if perr.Err == stdcsv.ErrFieldCount { | ||
return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil | ||
} | ||
|
@@ -102,3 +129,20 @@ func FormatError(err error, locale translation.Locale) (string, error) { | |
|
||
return "", err | ||
} | ||
|
||
// Looks for possible delimiters right before or after (with spaces after the former) double quotes with closing quotes | ||
var beforeAfterQuotes = regexp.MustCompile(`([,@\t;|]{0,1}) *(?:"[^"]*")+([,@\t;|]{0,1})`) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So if I see that correctly, adding tabs between the separator (i.e. comma) and a quoted value, we will have an incorrectly guessed delimiter (tab)? I don't think we've seen the last of this issue with this approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @delvh Do people add tabs after a separator? I know many people find a comma delimited csv to be easier to read with a space after the comma or semicolon:
But I can't image in doing
Are you suggesting to allow [\s]* instead of just a space so it gets all white space? Yet then I wonder if that could mess up when there are two tabs because no value for that cell
Maybe
as the regex? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just realized, the above regex I suggested would now fail on this, guessing comma instead of tab, even though the first row uses a tab:
Here the comma is part of the text. I think if this is the case, we should just use the guestDelimiter() function and not try to do this before/after quote as it might guess wrong. If we still use it, we should probably try it on the first row to make sure we get more than two columns returned, or maybe in guessDelimiter() we call guessFromBeforeAfterQuotes and we try that on the 10 lines first, rather than trying all delimiters and if it passes, just return it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm voting it is just best to go through the list of the delimiters and find what works for the sample 1 - 10 lines, removing all quoted strings so we have no values containing new lines. If we do this "find delimiter before/after quote" we still have to check if it works with the sample text. |
||
|
||
// guessFromBeforeAfterQuotes guesses the limiter by finding a double quote that has a valid delimiter before it and a closing quote, | ||
// or a double quote with a closing quote and a valid delimiter after it | ||
func guessFromBeforeAfterQuotes(data []byte) rune { | ||
rs := beforeAfterQuotes.FindStringSubmatch(string(data)) | ||
if rs != nil { | ||
if rs[1] != "" { | ||
return rune(rs[1][0]) | ||
} else if rs[2] != "" { | ||
return rune(rs[2][0]) | ||
} | ||
lunny marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return 0 | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we count file size in decimal or binary units?
Because in binary, this number should be a bit larger (10192, or 2^13, if I'm not mistaken).
I know that oftentimes, compilers prefer powers of two as those can be optimized better. I guess that should be the case here as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't something I added, as it was always a sample size of 10,000. So yeah, it is using 'k' here in the comments that were there before I worked on this as decimal. It's just an arbitrary sample size. I made it a constant so I can check if we may have a truncated line when getting 10 lines from that sample text.
See: https://github.com/go-gitea/gitea/blob/main/modules/csv/csv.go#L34
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. And I'm arguing to set it to a value that the compiler will be able to optimize better.
One small correction, though: Apparently, I thought that 2^(11+1) would be 5096, instead of 4096.
So, what I am asking for is to at best use either
1<<13
(8192
) or1<<14
(16384
) bytes.Which do you prefer?
Or do you prefer the
10000
, which is slightly easier to read, but will be worse in runtime performance?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't matter whatever we choose, 1e4, 16384, 8192, all work, you won't feel any difference.
We are not writing an OS or a memory allocation library, we won't benefit from the page-aligned memory size.
We do not need to struggle on such trivial details.