From d19fcf48b429f99bf4613ac6b37222584ac07bc9 Mon Sep 17 00:00:00 2001 From: zemnmez Date: Wed, 13 Jul 2022 17:43:01 -0700 Subject: [PATCH] Speed up csvpretty --- go/cmd/csvpretty/csvpretty.go | 223 +++++++++++++++++++++++++++------- go/cmd/csvpretty/test_out.csv | 1 - 2 files changed, 182 insertions(+), 42 deletions(-) diff --git a/go/cmd/csvpretty/csvpretty.go b/go/cmd/csvpretty/csvpretty.go index 2dc01908df..a66697f6c0 100644 --- a/go/cmd/csvpretty/csvpretty.go +++ b/go/cmd/csvpretty/csvpretty.go @@ -3,7 +3,9 @@ package main import ( "bytes" "encoding/csv" + "errors" "flag" + "fmt" "io" "io/ioutil" "os" @@ -26,6 +28,46 @@ func main() { panic(err) } +type lineWhitespaceTrimmer struct { + out io.Writer + currentLine []byte +} + +func (l lineWhitespaceTrimmer) trimmedCurrentLine() []byte { + return bytes.TrimSpace(l.currentLine) +} + +func (l *lineWhitespaceTrimmer) Flush() (err error) { + _, err = l.out.Write(l.trimmedCurrentLine()) + l.currentLine = nil + + return +} + +func (l *lineWhitespaceTrimmer) Close() (err error) { + l.Flush() + return nil +} + +func (l *lineWhitespaceTrimmer) Write(b []byte) (n int, err error) { + l.currentLine = append(l.currentLine, b...) + + for bytes.Contains(l.currentLine, []byte("\n")) { + splits := bytes.SplitN(l.currentLine, []byte("\n"), 2) + l.currentLine = splits[1] + _, err = l.out.Write(append(bytes.TrimSpace(splits[0]), []byte("\n")...)) + + if err != nil { + err = fmt.Errorf("lineWhitespaceTrimmer: %w", err) + return + } + } + + n = len(b) + + return +} + type byteReplacer struct { out io.Writer from byte @@ -34,6 +76,9 @@ type byteReplacer struct { func (br byteReplacer) Write(b []byte) (n int, err error) { n, err = br.out.Write(bytes.Replace(b, []byte{br.from}, []byte(br.to), -1)) + if err != nil { + err = fmt.Errorf("byteReplacer: %w", err) + } return len(b), err } @@ -41,12 +86,14 @@ var input string var output string var overwrite bool var debug bool +var validate bool func init() { flag.StringVar(&input, "input", "", "input file") flag.StringVar(&output, "output", "", "output file") flag.BoolVar(&overwrite, "w", false, "overwrite input with output") flag.BoolVar(&debug, "debug", false, "print debug info") + flag.BoolVar(&validate, "validate", false, "Validate the number of fields is the same on every row.") } const holder = '\x01' @@ -57,44 +104,63 @@ func (e errUsage) Error() string { return string(e) } var missingInput errUsage = "missing input" -func do() (err error) { - flag.Parse() +type byteWriteCounter struct { + out io.Writer + ctr uint64 +} - if input == "" { - return missingInput - } +func (b *byteWriteCounter) Write(bt []byte) (n int, err error) { + n, err = b.out.Write(bt) - if overwrite { - output = input - } + b.ctr += uint64(n) - var bt []byte - if bt, err = ioutil.ReadFile(input); err != nil { - return + if err != nil { + err = fmt.Errorf("byteWriteCounter: %w", err) } - var records [][]string - if records, err = csv.NewReader(bytes.NewReader(bt)).ReadAll(); err != nil { - return - } + return +} - var buf bytes.Buffer - var tabFlags uint = 0 +type PrettyCSV struct { + rd io.ReadCloser + debug bool +} + +func (p PrettyCSV) Read(b []byte) (n int, err error) { + panic("this is secretly not a reader! Please use WriteTo()") +} + +func (p PrettyCSV) Close() (err error) { + return p.rd.Close() +} + +func (p PrettyCSV) WriteTo(w io.Writer) (n int64, err error) { var padChr byte = ' ' - if debug { + + var tabFlags uint = 0 + if p.debug { tabFlags |= tabwriter.Debug padChr = '-' } - tabWriter := tabwriter.NewWriter(&buf, 0, 1, 3, padChr, tabFlags) - var tabReplacer = byteReplacer{ + // This is a pipeline of + // csv reader -> csv writer -> lineReplacer -> tabReplacer -> tabWriter -> lineWhitespaceTrimmer -> byteWriteCounter -> out + + ctr := &byteWriteCounter{out: w} + + trimmer := &lineWhitespaceTrimmer{ + out: ctr, + } + + tabWriter := tabwriter.NewWriter(trimmer, 0, 1, 3, padChr, tabFlags) + + tabReplacer := byteReplacer{ out: tabWriter, from: holder, to: ",\t", } - // tabwriter wants a \t at the end of each row too. - var lineReplacer = byteReplacer{ + lineReplacer := byteReplacer{ out: tabReplacer, from: '\n', to: "\t\n", @@ -103,49 +169,124 @@ func do() (err error) { csvWriter := csv.NewWriter(lineReplacer) csvWriter.Comma = holder - for _, row := range records { + csvRd := csv.NewReader(p.rd) + + for { + var row []string + row, err = csvRd.Read() + + if errors.Is(err, csv.ErrFieldCount) && !validate { + err = nil + } + + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + err = fmt.Errorf("Reading CSV row: %w", err) + return + } + + // strip any parsed spaces for i, f := range row { - // strip any parsed spaces (which i'd like to be ignored) row[i] = strings.TrimSpace(f) } + if err = csvWriter.Write(row); err != nil { + err = fmt.Errorf("Writing CSV row: %w", err) return } } csvWriter.Flush() - if _, err = tabWriter.Write([]byte("\t")); err != nil { + + if err = csvWriter.Error(); err != nil { + err = fmt.Errorf("CSV flush: %w", err) + return + } + + if _, err = tabWriter.Write([]byte{'\t'}); err != nil { + err = fmt.Errorf("Writing final tab: %w", err) return } if err = tabWriter.Flush(); err != nil { + err = fmt.Errorf("Flushing tabWriter: %w", err) return } - // trim extra space around each line - var trimmed bytes.Buffer - for _, line := range bytes.Split(buf.Bytes(), []byte("\n")) { - if _, err = trimmed.Write(append(bytes.TrimSpace(line), []byte("\n")...)); err != nil { - return - } + if err = trimmer.Flush(); err != nil { + err = fmt.Errorf("Flushing line trimmer: %w", err) + return } - var outfile io.Writer = os.Stdout + return int64(ctr.ctr), nil +} - if output != "" && output != "-" { - var f *os.File - if f, err = os.OpenFile(output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0700); err != nil { - return - } +func do() (err error) { + flag.Parse() + + if input == "" { + return missingInput + } + + if overwrite { + output = input + } + + inputFile, err := os.Open(input) + + defer inputFile.Close() - defer f.Close() - outfile = f + if err != nil { + err = fmt.Errorf("Opening input file: %w", err) + return } - if _, err = io.Copy(outfile, &trimmed); err != nil { + tempOut, err := ioutil.TempFile("", "csvpretty") + defer os.Remove(tempOut.Name()) + + if err != nil { + err = fmt.Errorf("Creating temporary file: %w", err) return } - return + _, err = io.Copy(tempOut, PrettyCSV{rd: inputFile, debug: debug}) + if err != nil { + err = fmt.Errorf("Copying pretty CSV to out: %w", err) + return + } + + if err = inputFile.Close(); err != nil { + err = fmt.Errorf("Closing input file: %w", err) + return + } + + outputFile, err := os.OpenFile(output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0700) + defer outputFile.Close() + + if err != nil { + err = fmt.Errorf("Opening output file: %w", err) + return + } + + // return to beginning of the file + _, err = tempOut.Seek(0, 0) + + if err != nil { + err = fmt.Errorf("Seeking to beginning of temp file: %w", err) + return + } + + _, err = io.Copy(outputFile, tempOut) + + if err != nil { + err = fmt.Errorf("Copying temp file to output: %w", err) + return + } + + outputFile.Close() + return nil } diff --git a/go/cmd/csvpretty/test_out.csv b/go/cmd/csvpretty/test_out.csv index 3b02012ada..4ff1f29538 100644 --- a/go/cmd/csvpretty/test_out.csv +++ b/go/cmd/csvpretty/test_out.csv @@ -2,4 +2,3 @@ Name, Subspecies, Species West African Giraffe, G. c. peralta, G. camelopardalis Meerkat, , s.Suricata Dog, C. l. familiaris, C. lupus -