go-gitea · lunny · Oct 30, 2021 · Oct 25, 2021 · Oct 25, 2021 · Oct 26, 2021
diff --git a/modules/csv/csv.go b/modules/csv/csv.go
@@ -7,16 +7,18 @@ package csv
 import (
 	"bytes"
 	stdcsv "encoding/csv"
-	"errors"
 	"io"
+	"path/filepath"
 	"regexp"
 	"strings"
 
+	"code.gitea.io/gitea/modules/markup"
 	"code.gitea.io/gitea/modules/translation"
 	"code.gitea.io/gitea/modules/util"
 )
 
-var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
+const maxLines = 10
+const guessSampleSize = 1e4 // 10k
 
 // CreateReader creates a csv.Reader with the given delimiter.
 func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
@@ -30,70 +32,95 @@ func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
 	return rd
 }
 
-// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
-// Reads at most 10k bytes.
-func CreateReaderAndGuessDelimiter(rd io.Reader) (*stdcsv.Reader, error) {
-	var data = make([]byte, 1e4)
+// CreateReaderAndDetermineDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
+// Reads at most guessSampleSize bytes.
+func CreateReaderAndDetermineDelimiter(ctx *markup.RenderContext, rd io.Reader) (*stdcsv.Reader, error) {
+	var data = make([]byte, guessSampleSize)
 	size, err := util.ReadAtMost(rd, data)
 	if err != nil {
 		return nil, err
 	}
 
 	return CreateReader(
 		io.MultiReader(bytes.NewReader(data[:size]), rd),
-		guessDelimiter(data[:size]),
+		determineDelimiter(ctx, data[:size]),
 	), nil
 }
 
-// guessDelimiter scores the input CSV data against delimiters, and returns the best match.
-func guessDelimiter(data []byte) rune {
-	maxLines := 10
-	text := quoteRegexp.ReplaceAllLiteralString(string(data), "")
-	lines := strings.SplitN(text, "\n", maxLines+1)
-	lines = lines[:util.Min(maxLines, len(lines))]
-
-	delimiters := []rune{',', ';', '\t', '|', '@'}
-	bestDelim := delimiters[0]
-	bestScore := 0.0
-	for _, delim := range delimiters {
-		score := scoreDelimiter(lines, delim)
-		if score > bestScore {
-			bestScore = score
-			bestDelim = delim
-		}
+// determineDelimiter takes a RenderContext and if it isn't nil and the Filename has an extension that specifies the delimiter,
+// it is used as the delimiter. Otherwise we call guessDelimiter with the data passed
+func determineDelimiter(ctx *markup.RenderContext, data []byte) rune {
+	extension := ".csv"
+	if ctx != nil {
+		extension = strings.ToLower(filepath.Ext(ctx.Filename))
+	}
+
+	var delimiter rune
+	switch extension {
+	case ".tsv":
+		delimiter = '\t'
+	case ".psv":
+		delimiter = '|'
+	default:
+		delimiter = guessDelimiter(data)
 	}
 
-	return bestDelim
+	return delimiter
 }
 
-// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV.
-func scoreDelimiter(lines []string, delim rune) float64 {
-	countTotal := 0
-	countLineMax := 0
-	linesNotEqual := 0
+// quoteRegexp follows the RFC-4180 CSV standard for when double-quotes are used to enclose fields, then a double-quote appearing inside a
+// field must be escaped by preceding it with another double quote. https://www.ietf.org/rfc/rfc4180.txt
+// This finds all quoted strings that have escaped quotes.
+var quoteRegexp = regexp.MustCompile(`"[^"]*"`)
 
-	for _, line := range lines {
-		if len(line) == 0 {
-			continue
-		}
+// removeQuotedStrings uses the quoteRegexp to remove all quoted strings so that we can realiably have each row on one line
+// (quoted strings often have new lines within the string)
+func removeQuotedString(text string) string {
+	return quoteRegexp.ReplaceAllLiteralString(text, "")
+}
 
-		countLine := strings.Count(line, string(delim))
-		countTotal += countLine
-		if countLine != countLineMax {
-			if countLineMax != 0 {
-				linesNotEqual++
-			}
-			countLineMax = util.Max(countLine, countLineMax)
-		}
+// guessDelimiter takes up to maxLines of the CSV text, iterates through the possible delimiters, and sees if the CSV Reader reads it without throwing any errors.
+// If more than one delmiiter passes, the delimiter that results in the most columns is returned.
+func guessDelimiter(data []byte) rune {
+	delimiter := guessFromBeforeAfterQuotes(data)
+	if delimiter != 0 {
+		return delimiter
 	}
 
-	return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
+	// Removes quoted values so we don't have columns with new lines in them
+	text := removeQuotedString(string(data))
+
+	// Make the text just be maxLines or less without truncated lines
+	lines := strings.SplitN(text, "\n", maxLines+1) // Will contain at least one line, and if there are more than MaxLines, the last item holds the rest of the lines
+	if len(lines) > maxLines {
+		// If the length of lines is > maxLines we know we have the max number of lines, trim it to maxLines
+		lines = lines[:maxLines]
+	} else if len(lines) > 1 && len(data) >= guessSampleSize {
+		// Even with data >= guessSampleSize, we don't have maxLines + 1 (no extra lines, must have really long lines)
+		// thus the last line is probably have a truncated line. Drop the last line if len(lines) > 1
+		lines = lines[:len(lines)-1]
+	}
+
+	// Put lines back together as a string
+	text = strings.Join(lines, "\n")
+
+	delimiters := []rune{',', '\t', ';', '|', '@'}
+	validDelim := delimiters[0]
+	validDelimColCount := 0
+	for _, delim := range delimiters {
+		csvReader := stdcsv.NewReader(strings.NewReader(text))
+		csvReader.Comma = delim
+		if rows, err := csvReader.ReadAll(); err == nil && len(rows) > 0 && len(rows[0]) > validDelimColCount {
+			validDelim = delim
+			validDelimColCount = len(rows[0])
+		}
+	}
+	return validDelim
 }
 
 // FormatError converts csv errors into readable messages.
 func FormatError(err error, locale translation.Locale) (string, error) {
-	var perr *stdcsv.ParseError
-	if errors.As(err, &perr) {
+	if perr, ok := err.(*stdcsv.ParseError); ok {
 		if perr.Err == stdcsv.ErrFieldCount {
 			return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
 		}
@@ -102,3 +129,20 @@ func FormatError(err error, locale translation.Locale) (string, error) {
 
 	return "", err
 }
+
+// Looks for possible delimiters right before or after (with spaces after the former) double quotes with closing quotes
+var beforeAfterQuotes = regexp.MustCompile(`([,@\t;|]{0,1}) *(?:"[^"]*")+([,@\t;|]{0,1})`)
+
+// guessFromBeforeAfterQuotes guesses the limiter by finding a double quote that has a valid delimiter before it and a closing quote,
+// or a double quote with a closing quote and a valid delimiter after it
+func guessFromBeforeAfterQuotes(data []byte) rune {
+	rs := beforeAfterQuotes.FindStringSubmatch(string(data))
+	if rs != nil {
+		if rs[1] != "" {
+			return rune(rs[1][0])
+		} else if rs[2] != "" {
+			return rune(rs[2][0])
+		}
+	}
+	return 0
+}