Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up csvpretty #553

Merged
merged 2 commits into from
Jul 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 182 additions & 41 deletions go/cmd/csvpretty/csvpretty.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package main
import (
"bytes"
"encoding/csv"
"errors"
"flag"
"fmt"
"io"
"io/ioutil"
"os"
Expand All @@ -26,6 +28,46 @@ func main() {
panic(err)
}

// lineWhitespaceTrimmer is an io.Writer that buffers incoming bytes and, for
// every completed line, writes that line to out with its leading and trailing
// whitespace removed, followed by a single '\n'.
//
// Bytes received after the last newline stay buffered in currentLine until
// Flush or Close is called.
type lineWhitespaceTrimmer struct {
	out         io.Writer // destination for trimmed lines
	currentLine []byte    // bytes received since the last newline
}

// trimmedCurrentLine returns the buffered partial line with surrounding
// whitespace removed. It does not modify the buffer.
func (l *lineWhitespaceTrimmer) trimmedCurrentLine() []byte {
	return bytes.TrimSpace(l.currentLine)
}

// Flush writes the buffered partial line (trimmed, with no trailing newline)
// to out and empties the buffer. The buffer is emptied even when the write
// fails; the write error, if any, is returned as-is.
func (l *lineWhitespaceTrimmer) Flush() error {
	_, err := l.out.Write(l.trimmedCurrentLine())
	l.currentLine = nil

	return err
}

// Close flushes any buffered partial line and reports the flush error.
// It does not close out.
func (l *lineWhitespaceTrimmer) Close() error {
	return l.Flush()
}

// Write appends b to the line buffer and emits every line completed by it.
//
// Each completed line is sent to out in a single Write call as the trimmed
// line plus '\n'. On success it returns len(b). If out fails, it returns 0 and
// the error wrapped with a "lineWhitespaceTrimmer: " prefix; the line that
// failed has already been removed from the buffer.
func (l *lineWhitespaceTrimmer) Write(b []byte) (n int, err error) {
	l.currentLine = append(l.currentLine, b...)

	for {
		end := bytes.IndexByte(l.currentLine, '\n')
		if end < 0 {
			break
		}

		line := bytes.TrimSpace(l.currentLine[:end])
		l.currentLine = l.currentLine[end+1:]

		// The three-index slice caps the capacity so append copies the line
		// instead of writing the newline into the shared buffer.
		if _, werr := l.out.Write(append(line[:len(line):len(line)], '\n')); werr != nil {
			return 0, fmt.Errorf("lineWhitespaceTrimmer: %w", werr)
		}
	}

	return len(b), nil
}

type byteReplacer struct {
out io.Writer
from byte
Expand All @@ -34,19 +76,24 @@ type byteReplacer struct {

// Write replaces every occurrence of the byte br.from in b with the string
// br.to and forwards the result to br.out in a single call.
//
// The returned count is always len(b), the number of input bytes consumed,
// whatever the size of the replaced output and whether or not the underlying
// write failed. An error from br.out is wrapped with a "byteReplacer: " prefix.
func (br byteReplacer) Write(b []byte) (int, error) {
	replaced := bytes.ReplaceAll(b, []byte{br.from}, []byte(br.to))

	if _, werr := br.out.Write(replaced); werr != nil {
		return len(b), fmt.Errorf("byteReplacer: %w", werr)
	}

	return len(b), nil
}

// Command-line options. Each one is bound to a flag in init.
var (
	input     string // -input
	output    string // -output
	overwrite bool   // -w
	debug     bool   // -debug
	validate  bool   // -validate
)

// init registers the command-line flags on the default flag set.
func init() {
	fs := flag.CommandLine

	fs.StringVar(&input, "input", "", "input file")
	fs.StringVar(&output, "output", "", "output file")

	fs.BoolVar(&overwrite, "w", false, "overwrite input with output")
	fs.BoolVar(&debug, "debug", false, "print debug info")
	fs.BoolVar(&validate, "validate", false, "Validate the number of fields is the same on every row.")
}

// holder is the placeholder field separator: it is set as the csv.Writer's
// Comma, and a byteReplacer then swaps each occurrence for ",\t" before the
// text reaches the tabwriter.
const holder = '\x01'
Expand All @@ -57,44 +104,63 @@ func (e errUsage) Error() string { return string(e) }

// missingInput is the usage error do returns when the -input flag is empty.
var missingInput errUsage = "missing input"

func do() (err error) {
flag.Parse()
// byteWriteCounter is an io.Writer that forwards every write to out and keeps
// a running total of the bytes out reported as written.
type byteWriteCounter struct {
	out io.Writer // destination for all writes
	ctr uint64    // total bytes accepted by out so far
}

// Write forwards bt to the underlying writer and adds the number of bytes it
// accepted to the running total, including partial writes that end in an
// error. It returns the underlying count unchanged; a non-nil error is wrapped
// with a "byteWriteCounter: " prefix.
func (b *byteWriteCounter) Write(bt []byte) (n int, err error) {
	n, err = b.out.Write(bt)

	// A misbehaving writer could report a negative count; never let that
	// wrap around the unsigned total.
	if n > 0 {
		b.ctr += uint64(n)
	}

	if err != nil {
		err = fmt.Errorf("byteWriteCounter: %w", err)
	}

	return n, err
}

var buf bytes.Buffer
var tabFlags uint = 0
// PrettyCSV wraps a CSV input stream. It carries Read and Close methods, but
// Read is a stub: the data is only available through WriteTo.
type PrettyCSV struct {
	rd    io.ReadCloser // raw CSV input
	debug bool          // turns on tabwriter.Debug and '-' padding in WriteTo
}

// Read is a placeholder that always panics; use WriteTo instead.
func (PrettyCSV) Read([]byte) (int, error) {
	panic("this is secretly not a reader! Please use WriteTo()")
}

// Close closes the wrapped input and returns whatever error it reports.
func (src PrettyCSV) Close() error {
	closeErr := src.rd.Close()
	return closeErr
}

func (p PrettyCSV) WriteTo(w io.Writer) (n int64, err error) {
var padChr byte = ' '
if debug {

var tabFlags uint = 0
if p.debug {
tabFlags |= tabwriter.Debug
padChr = '-'
}
tabWriter := tabwriter.NewWriter(&buf, 0, 1, 3, padChr, tabFlags)

var tabReplacer = byteReplacer{
// This is a pipeline of
// csv reader -> csv writer -> lineReplacer -> tabReplacer -> tabWriter -> lineWhitespaceTrimmer -> byteWriteCounter -> out

ctr := &byteWriteCounter{out: w}

trimmer := &lineWhitespaceTrimmer{
out: ctr,
}

tabWriter := tabwriter.NewWriter(trimmer, 0, 1, 3, padChr, tabFlags)

tabReplacer := byteReplacer{
out: tabWriter,
from: holder,
to: ",\t",
}

// tabwriter wants a \t at the end of each row too.
var lineReplacer = byteReplacer{
lineReplacer := byteReplacer{
out: tabReplacer,
from: '\n',
to: "\t\n",
Expand All @@ -103,49 +169,124 @@ func do() (err error) {
csvWriter := csv.NewWriter(lineReplacer)
csvWriter.Comma = holder

for _, row := range records {
csvRd := csv.NewReader(p.rd)

for {
var row []string
row, err = csvRd.Read()

if errors.Is(err, csv.ErrFieldCount) && !validate {
err = nil
}

if err != nil {
if errors.Is(err, io.EOF) {
break
}

err = fmt.Errorf("Reading CSV row: %w", err)
return
}

// strip any parsed spaces
for i, f := range row {
// strip any parsed spaces (which i'd like to be ignored)
row[i] = strings.TrimSpace(f)
}

if err = csvWriter.Write(row); err != nil {
err = fmt.Errorf("Writing CSV row: %w", err)
return
}
}

csvWriter.Flush()
if _, err = tabWriter.Write([]byte("\t")); err != nil {

if err = csvWriter.Error(); err != nil {
err = fmt.Errorf("CSV flush: %w", err)
return
}

if _, err = tabWriter.Write([]byte{'\t'}); err != nil {
err = fmt.Errorf("Writing final tab: %w", err)
return
}

if err = tabWriter.Flush(); err != nil {
err = fmt.Errorf("Flushing tabWriter: %w", err)
return
}

// trim extra space around each line
var trimmed bytes.Buffer
for _, line := range bytes.Split(buf.Bytes(), []byte("\n")) {
if _, err = trimmed.Write(append(bytes.TrimSpace(line), []byte("\n")...)); err != nil {
return
}
if err = trimmer.Flush(); err != nil {
err = fmt.Errorf("Flushing line trimmer: %w", err)
return
}

var outfile io.Writer = os.Stdout
return int64(ctr.ctr), nil
}

if output != "" && output != "-" {
var f *os.File
if f, err = os.OpenFile(output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0700); err != nil {
return
}
func do() (err error) {
flag.Parse()

if input == "" {
return missingInput
}

if overwrite {
output = input
}

inputFile, err := os.Open(input)

defer inputFile.Close()

defer f.Close()
outfile = f
if err != nil {
err = fmt.Errorf("Opening input file: %w", err)
return
}

if _, err = io.Copy(outfile, &trimmed); err != nil {
tempOut, err := ioutil.TempFile("", "csvpretty")
defer os.Remove(tempOut.Name())

if err != nil {
err = fmt.Errorf("Creating temporary file: %w", err)
return
}

return
_, err = io.Copy(tempOut, PrettyCSV{rd: inputFile, debug: debug})
if err != nil {
err = fmt.Errorf("Copying pretty CSV to out: %w", err)
return
}

if err = inputFile.Close(); err != nil {
err = fmt.Errorf("Closing input file: %w", err)
return
}

outputFile, err := os.OpenFile(output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0700)
defer outputFile.Close()

if err != nil {
err = fmt.Errorf("Opening output file: %w", err)
return
}

// return to beginning of the file
_, err = tempOut.Seek(0, 0)

if err != nil {
err = fmt.Errorf("Seeking to beginning of temp file: %w", err)
return
}

_, err = io.Copy(outputFile, tempOut)

if err != nil {
err = fmt.Errorf("Copying temp file to output: %w", err)
return
}

outputFile.Close()

return nil
}
1 change: 0 additions & 1 deletion go/cmd/csvpretty/test_out.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@ Name, Subspecies, Species
West African Giraffe, G. c. peralta, G. camelopardalis
Meerkat, , s.Suricata
Dog, C. l. familiaris, C. lupus