diff --git a/go.mod b/go.mod index f7322a8120f78..0e50b727a7d45 100644 --- a/go.mod +++ b/go.mod @@ -130,7 +130,7 @@ require ( github.com/influxdata/tdigest v0.0.2-0.20210216194612-fc98d27c9e8b github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db github.com/ncw/swift/v2 v2.0.3 - github.com/parquet-go/parquet-go v0.24.0 + github.com/parquet-go/parquet-go v0.25.0 github.com/prometheus/alertmanager v0.28.0 github.com/prometheus/common/sigv4 v0.1.0 github.com/prometheus/sigv4 v0.1.2 diff --git a/go.sum b/go.sum index b1ccf704727a6..542711506f192 100644 --- a/go.sum +++ b/go.sum @@ -979,8 +979,8 @@ github.com/oschwald/maxminddb-golang v1.13.0/go.mod h1:BU0z8BfFVhi1LQaonTwwGQlsH github.com/ovh/go-ovh v1.6.0 h1:ixLOwxQdzYDx296sXcgS35TOPEahJkpjMGtzPadCjQI= github.com/ovh/go-ovh v1.6.0/go.mod h1:cTVDnl94z4tl8pP1uZ/8jlVxntjSIf09bNcQ5TJSC7c= github.com/pact-foundation/pact-go v1.0.4/go.mod h1:uExwJY4kCzNPcHRj+hCR/HBbOOIwwtUjcrb0b5/5kLM= -github.com/parquet-go/parquet-go v0.24.0 h1:VrsifmLPDnas8zpoHmYiWDZ1YHzLmc7NmNwPGkI2JM4= -github.com/parquet-go/parquet-go v0.24.0/go.mod h1:OqBBRGBl7+llplCvDMql8dEKaDqjaFA/VAPw+OJiNiw= +github.com/parquet-go/parquet-go v0.25.0 h1:GwKy11MuF+al/lV6nUsFw8w8HCiPOSAx1/y8yFxjH5c= +github.com/parquet-go/parquet-go v0.25.0/go.mod h1:OqBBRGBl7+llplCvDMql8dEKaDqjaFA/VAPw+OJiNiw= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= diff --git a/vendor/github.com/parquet-go/parquet-go/.gitignore b/vendor/github.com/parquet-go/parquet-go/.gitignore index c3ca928813ae1..b3584c8d4de0e 100644 --- a/vendor/github.com/parquet-go/parquet-go/.gitignore +++ b/vendor/github.com/parquet-go/parquet-go/.gitignore @@ -4,6 +4,7 @@ *.dll *.so *.dylib +*.py # Test binary, built with `go test -c` *.test diff --git a/vendor/github.com/parquet-go/parquet-go/CODEOWNERS b/vendor/github.com/parquet-go/parquet-go/CODEOWNERS index 9a95c2a07c73e..45632dfe361ab 100644 --- a/vendor/github.com/parquet-go/parquet-go/CODEOWNERS +++ b/vendor/github.com/parquet-go/parquet-go/CODEOWNERS @@ -1 +1 @@ -* @achille-roussel @asubiotto @brancz @fpetkovski @joe-elliott @kevinburkesegment @mdisibio @metalmatze @Pryz @thorfour \ No newline at end of file +* @achille-roussel @fpetkovski @joe-elliott @thorfour diff --git a/vendor/github.com/parquet-go/parquet-go/README.md b/vendor/github.com/parquet-go/parquet-go/README.md index 94bbdefad5b6a..49341ae5aa27a 100644 --- a/vendor/github.com/parquet-go/parquet-go/README.md +++ b/vendor/github.com/parquet-go/parquet-go/README.md @@ -310,7 +310,7 @@ if err != nil { } writer := parquet.NewGenericWriter[RowType](output) -_, err := parquet.CopyRows(writer, merge) +_, err := parquet.CopyRows(writer, merge.Rows()) if err != nil { ... } diff --git a/vendor/github.com/parquet-go/parquet-go/allocator.go b/vendor/github.com/parquet-go/parquet-go/allocator.go index 693ee5a2471d1..d1500643b3c52 100644 --- a/vendor/github.com/parquet-go/parquet-go/allocator.go +++ b/vendor/github.com/parquet-go/parquet-go/allocator.go @@ -51,7 +51,7 @@ func (a *allocator) reset() { // contracts that do not allow the implementations to retain the rows they // are passed as arguments. 
// -// See: RowBuffer, DedupeRowReader, DedupeRowWriter +// See: RowBuffer, NewRowGroupRowReader, NewColumnChunkRowReader type rowAllocator struct{ allocator } func (a *rowAllocator) capture(row Row) { diff --git a/vendor/github.com/parquet-go/parquet-go/bloom.go b/vendor/github.com/parquet-go/parquet-go/bloom.go index 30c64b8486150..911de082122ee 100644 --- a/vendor/github.com/parquet-go/parquet-go/bloom.go +++ b/vendor/github.com/parquet-go/parquet-go/bloom.go @@ -31,13 +31,19 @@ type BloomFilter interface { Check(value Value) (bool, error) } -type bloomFilter struct { +type errorBloomFilter struct{ err error } + +func (f *errorBloomFilter) Size() int64 { return 0 } +func (f *errorBloomFilter) ReadAt([]byte, int64) (int, error) { return 0, f.err } +func (f *errorBloomFilter) Check(Value) (bool, error) { return false, f.err } + +type FileBloomFilter struct { io.SectionReader hash bloom.Hash check func(io.ReaderAt, int64, uint64) (bool, error) } -func (f *bloomFilter) Check(v Value) (bool, error) { +func (f *FileBloomFilter) Check(v Value) (bool, error) { return f.check(&f.SectionReader, f.Size(), v.hash(f.hash)) } @@ -54,11 +60,11 @@ func (v Value) hash(h bloom.Hash) uint64 { } } -func newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *bloomFilter { +func newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *FileBloomFilter { if header.Algorithm.Block != nil { if header.Hash.XxHash != nil { if header.Compression.Uncompressed != nil { - return &bloomFilter{ + return &FileBloomFilter{ SectionReader: *io.NewSectionReader(file, offset, int64(header.NumBytes)), hash: bloom.XXH64{}, check: bloom.CheckSplitBlock, diff --git a/vendor/github.com/parquet-go/parquet-go/bloom/xxhash/sum64uint_amd64.s b/vendor/github.com/parquet-go/parquet-go/bloom/xxhash/sum64uint_amd64.s index da7f80e24902d..9d420a9d9100c 100644 --- a/vendor/github.com/parquet-go/parquet-go/bloom/xxhash/sum64uint_amd64.s +++ b/vendor/github.com/parquet-go/parquet-go/bloom/xxhash/sum64uint_amd64.s @@ -65,8 +65,8 @@ passing as a guarantee that they have not introduced regressions. #define prime1 R12 #define prime2 R13 #define prime3 R14 -#define prime4 R15 -#define prime5 R15 // same as prime4 because they are not used together +#define prime4 R11 +#define prime5 R11 // same as prime4 because they are not used together #define prime1ZMM Z12 #define prime2ZMM Z13 diff --git a/vendor/github.com/parquet-go/parquet-go/buffer.go b/vendor/github.com/parquet-go/parquet-go/buffer.go index ab5b4ea55c085..706184edbe7c7 100644 --- a/vendor/github.com/parquet-go/parquet-go/buffer.go +++ b/vendor/github.com/parquet-go/parquet-go/buffer.go @@ -298,10 +298,10 @@ func (buf *Buffer) NumRows() int64 { return int64(buf.Len()) } // ColumnChunks returns the buffer columns. func (buf *Buffer) ColumnChunks() []ColumnChunk { return buf.chunks } -// ColumnBuffer returns the buffer columns. +// ColumnBuffers returns the buffer columns. // // This method is similar to ColumnChunks, but returns a list of ColumnBuffer -// instead of a ColumnChunk values (the latter being read-only); calling +// instead of a list of ColumnChunk (the latter being read-only); calling // ColumnBuffers or ColumnChunks with the same index returns the same underlying // objects, but with different types, which removes the need for making a type // assertion if the program needed to write directly to the column buffers. 
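The ColumnBuffers/ColumnChunks distinction documented in the hunk above is easiest to see in a short sketch. This is illustrative only and not part of the vendored change; the `Point` type and values are made up, while `parquet.NewBuffer`, `parquet.SchemaOf`, `Buffer.Write`, `Buffer.ColumnChunks` and `Buffer.ColumnBuffers` are the public API the comment refers to.

```go
package main

import (
	"fmt"

	"github.com/parquet-go/parquet-go"
)

type Point struct {
	X int64 `parquet:"x"`
	Y int64 `parquet:"y"`
}

func main() {
	// Buffer rows in memory; the schema is derived from the Go struct.
	buf := parquet.NewBuffer(parquet.SchemaOf(Point{}))
	for _, p := range []Point{{X: 1, Y: 2}, {X: 3, Y: 4}} {
		if err := buf.Write(p); err != nil {
			panic(err)
		}
	}
	// Same underlying columns through two views: ColumnChunks is the
	// read-only view, ColumnBuffers the writable one, so no type
	// assertion is needed to write to a column directly.
	chunks := buf.ColumnChunks()
	buffers := buf.ColumnBuffers()
	fmt.Println(len(chunks), len(buffers), buf.NumRows())
}
```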
@@ -414,7 +414,7 @@ func (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error) { return 0, ErrRowGroupSchemaMissing case buf.schema == nil: buf.configure(rowGroupSchema) - case !nodesAreEqual(buf.schema, rowGroupSchema): + case !EqualNodes(buf.schema, rowGroupSchema): return 0, ErrRowGroupSchemaMismatch } if !sortingColumnsHavePrefix(rowGroup.SortingColumns(), buf.SortingColumns()) { @@ -431,7 +431,7 @@ func (buf *Buffer) WriteRowGroup(rowGroup RowGroup) (int64, error) { // // The buffer and the returned reader share memory. Mutating the buffer // concurrently to reading rows may result in non-deterministic behavior. -func (buf *Buffer) Rows() Rows { return newRowGroupRows(buf, ReadModeSync) } +func (buf *Buffer) Rows() Rows { return NewRowGroupRowReader(buf) } // bufferWriter is an adapter for Buffer which implements both RowWriter and // PageWriter to enable optimizations in CopyRows for types that support writing diff --git a/vendor/github.com/parquet-go/parquet-go/column.go b/vendor/github.com/parquet-go/parquet-go/column.go index 51f2d20a0971b..d3d29050d0a23 100644 --- a/vendor/github.com/parquet-go/parquet-go/column.go +++ b/vendor/github.com/parquet-go/parquet-go/column.go @@ -25,6 +25,7 @@ type Column struct { schema *format.SchemaElement order *format.ColumnOrder path columnPath + fields []Field columns []*Column chunks []*format.ColumnChunk columnIndex []*format.ColumnIndex @@ -56,13 +57,7 @@ func (c *Column) Required() bool { return schemaRepetitionTypeOf(c.schema) == fo func (c *Column) Leaf() bool { return c.index >= 0 } // Fields returns the list of fields on the column. -func (c *Column) Fields() []Field { - fields := make([]Field, len(c.columns)) - for i, column := range c.columns { - fields[i] = column - } - return fields -} +func (c *Column) Fields() []Field { return c.fields } // Encoding returns the encodings used by this column. func (c *Column) Encoding() encoding.Encoding { return c.encoding } @@ -97,20 +92,27 @@ func (c *Column) Column(name string) *Column { // Pages returns a reader exposing all pages in this column, across row groups. func (c *Column) Pages() Pages { - if c.index < 0 { + if c.file == nil { + return emptyPages{} + } + return c.PagesFrom(c.file.reader) +} + +func (c *Column) PagesFrom(reader io.ReaderAt) Pages { + if c.index < 0 || c.file == nil { return emptyPages{} } r := &columnPages{ - pages: make([]filePages, len(c.file.rowGroups)), + pages: make([]FilePages, len(c.file.rowGroups)), } for i := range r.pages { - r.pages[i].init(c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk)) + r.pages[i].init(c.file.rowGroups[i].(*FileRowGroup).columns[c.index].(*FileColumnChunk), reader) } return r } type columnPages struct { - pages []filePages + pages []FilePages index int } @@ -200,10 +202,10 @@ func (c *Column) forEachLeaf(do func(*Column)) { } } -func openColumns(file *File) (*Column, error) { +func openColumns(file *File, metadata *format.FileMetaData, columnIndexes []format.ColumnIndex, offsetIndexes []format.OffsetIndex) (*Column, error) { cl := columnLoader{} - c, err := cl.open(file, nil) + c, err := cl.open(file, metadata, columnIndexes, offsetIndexes, nil) if err != nil { return nil, err } @@ -211,7 +213,7 @@ func openColumns(file *File) (*Column, error) { // Validate that there aren't extra entries in the row group columns, // which would otherwise indicate that there are dangling data pages // in the file. 
- for index, rowGroup := range file.metadata.RowGroups { + for index, rowGroup := range metadata.RowGroups { if cl.rowGroupColumnIndex != len(rowGroup.Columns) { return nil, fmt.Errorf("row group at index %d contains %d columns but %d were referenced by the column schemas", index, len(rowGroup.Columns), cl.rowGroupColumnIndex) @@ -271,10 +273,10 @@ type columnLoader struct { rowGroupColumnIndex int } -func (cl *columnLoader) open(file *File, path []string) (*Column, error) { +func (cl *columnLoader) open(file *File, metadata *format.FileMetaData, columnIndexes []format.ColumnIndex, offsetIndexes []format.OffsetIndex, path []string) (*Column, error) { c := &Column{ file: file, - schema: &file.metadata.Schema[cl.schemaIndex], + schema: &metadata.Schema[cl.schemaIndex], } c.path = columnPath(path).append(c.schema.Name) @@ -284,12 +286,12 @@ func (cl *columnLoader) open(file *File, path []string) (*Column, error) { if numChildren == 0 { c.typ = schemaElementTypeOf(c.schema) - if cl.columnOrderIndex < len(file.metadata.ColumnOrders) { - c.order = &file.metadata.ColumnOrders[cl.columnOrderIndex] + if cl.columnOrderIndex < len(metadata.ColumnOrders) { + c.order = &metadata.ColumnOrders[cl.columnOrderIndex] cl.columnOrderIndex++ } - rowGroups := file.metadata.RowGroups + rowGroups := metadata.RowGroups rowGroupColumnIndex := cl.rowGroupColumnIndex cl.rowGroupColumnIndex++ @@ -304,21 +306,21 @@ func (cl *columnLoader) open(file *File, path []string) (*Column, error) { c.chunks = append(c.chunks, &rowGroup.Columns[rowGroupColumnIndex]) } - if len(file.columnIndexes) > 0 { + if len(columnIndexes) > 0 { for i := range rowGroups { - if rowGroupColumnIndex >= len(file.columnIndexes) { + if rowGroupColumnIndex >= len(columnIndexes) { return nil, fmt.Errorf("row group at index %d does not have enough column index pages", i) } - c.columnIndex = append(c.columnIndex, &file.columnIndexes[rowGroupColumnIndex]) + c.columnIndex = append(c.columnIndex, &columnIndexes[rowGroupColumnIndex]) } } - if len(file.offsetIndexes) > 0 { + if len(offsetIndexes) > 0 { for i := range rowGroups { - if rowGroupColumnIndex >= len(file.offsetIndexes) { + if rowGroupColumnIndex >= len(offsetIndexes) { return nil, fmt.Errorf("row group at index %d does not have enough offset index pages", i) } - c.offsetIndex = append(c.offsetIndex, &file.offsetIndexes[rowGroupColumnIndex]) + c.offsetIndex = append(c.offsetIndex, &offsetIndexes[rowGroupColumnIndex]) } } @@ -354,22 +356,28 @@ func (cl *columnLoader) open(file *File, path []string) (*Column, error) { c.typ = &mapType{} } else if lt != nil && lt.List != nil { c.typ = &listType{} + } else if lt != nil && lt.Variant != nil { + c.typ = &variantType{} } c.columns = make([]*Column, numChildren) for i := range c.columns { - if cl.schemaIndex >= len(file.metadata.Schema) { + if cl.schemaIndex >= len(metadata.Schema) { return nil, fmt.Errorf("column %q has more children than there are schemas in the file: %d > %d", - c.schema.Name, cl.schemaIndex+1, len(file.metadata.Schema)) + c.schema.Name, cl.schemaIndex+1, len(metadata.Schema)) } var err error - c.columns[i], err = cl.open(file, c.path) + c.columns[i], err = cl.open(file, metadata, columnIndexes, offsetIndexes, c.path) if err != nil { return nil, fmt.Errorf("%s: %w", c.schema.Name, err) } } + c.fields = make([]Field, len(c.columns)) + for i, column := range c.columns { + c.fields[i] = column + } return c, nil } @@ -633,6 +641,11 @@ func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer, dict Di } if repetitionLevels != 
nil { defer repetitionLevels.unref() + + if len(repetitionLevels.data) != 0 && repetitionLevels.data[0] != 0 { + return nil, fmt.Errorf("%w: first repetition level for column %d (%s) is %d instead of zero, indicating that the page contains trailing values from the previous page (this is forbidden for data pages v2)", + ErrMalformedRepetitionLevel, c.Index(), c.Name(), repetitionLevels.data[0]) + } } } diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer.go b/vendor/github.com/parquet-go/parquet-go/column_buffer.go index d1bc339862d82..4755f25836993 100644 --- a/vendor/github.com/parquet-go/parquet-go/column_buffer.go +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer.go @@ -1629,8 +1629,11 @@ func (col *fixedLenByteArrayColumnBuffer) Write(b []byte) (int, error) { } func (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) (int, error) { + if len(values) == 0 { + return 0, nil + } d, m := len(values)/col.size, len(values)%col.size - if m != 0 { + if d == 0 || m != 0 { return 0, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, len(values)) } col.data = append(col.data, values...) @@ -1638,7 +1641,10 @@ func (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) } func (col *fixedLenByteArrayColumnBuffer) WriteValues(values []Value) (int, error) { - for _, v := range values { + for i, v := range values { + if n := len(v.byteArray()); n != col.size { + return i, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, n) + } col.data = append(col.data, v.byteArray()...) } return len(values), nil @@ -2024,7 +2030,7 @@ func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsF case reflect.Array: if t.Elem().Kind() == reflect.Uint8 { - return writeRowsFuncOfRequired(t, schema, path) + return writeRowsFuncOfArray(t, schema, path) } case reflect.Pointer: @@ -2041,7 +2047,7 @@ func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsF } func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - column := schema.mapping.lookup(path) + column := schema.lazyLoadState().mapping.lookup(path) columnIndex := column.columnIndex return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { columns[columnIndex].writeValues(rows, levels) @@ -2050,6 +2056,15 @@ func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) wr } func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc { + if t.Kind() == reflect.Slice { // assume nested list + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + levels.definitionLevel++ + return writeRows(columns, rows, levels) + } + } nullIndex := nullIndexFuncOf(t) return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { if rows.Len() == 0 { @@ -2148,6 +2163,16 @@ func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, wr } } +func writeRowsFuncOfArray(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { + column := schema.lazyLoadState().mapping.lookup(path) + arrayLen := t.Len() + columnLen := column.node.Type().Length() + if arrayLen != columnLen { + panic(fmt.Sprintf("cannot convert Go values of type "+typeNameOf(t)+" to FIXED_LEN_BYTE_ARRAY(%d)", columnLen)) + } + return 
writeRowsFuncOfRequired(t, schema, path)
+}
+
 func writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
 	elemType := t.Elem()
 	elemSize := uintptr(elemType.Size())
@@ -2258,11 +2283,12 @@ func writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath) writ
 	columns := make([]column, len(fields))
 
 	for i, f := range fields {
-		optional := false
+		list, optional := false, false
 		columnPath := path.append(f.Name)
 		forEachStructTagOption(f, func(_ reflect.Type, option, _ string) {
 			switch option {
 			case "list":
+				list = true
 				columnPath = columnPath.append("list", "element")
 			case "optional":
 				optional = true
@@ -2271,8 +2297,10 @@
 		writeRows := writeRowsFuncOf(f.Type, schema, columnPath)
 		if optional {
-			switch f.Type.Kind() {
-			case reflect.Pointer, reflect.Slice:
+			kind := f.Type.Kind()
+			switch {
+			case kind == reflect.Pointer:
+			case kind == reflect.Slice && !list:
 			default:
 				writeRows = writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows)
 			}
diff --git a/vendor/github.com/parquet-go/parquet-go/column_chunk.go b/vendor/github.com/parquet-go/parquet-go/column_chunk.go
index ad1afb2f0f8c4..56dd35680029c 100644
--- a/vendor/github.com/parquet-go/parquet-go/column_chunk.go
+++ b/vendor/github.com/parquet-go/parquet-go/column_chunk.go
@@ -6,6 +6,7 @@ import (
 )
 
 var (
+	ErrMissingBloomFilter = errors.New("missing bloom filter")
 	ErrMissingColumnIndex = errors.New("missing column index")
 	ErrMissingOffsetIndex = errors.New("missing offset index")
 )
@@ -48,6 +49,113 @@ type ColumnChunk interface {
 	NumValues() int64
 }
 
+// AsyncColumnChunk returns a ColumnChunk that reads pages asynchronously.
+func AsyncColumnChunk(columnChunk ColumnChunk) ColumnChunk {
+	return &asyncColumnChunk{columnChunk}
+}
+
+type asyncColumnChunk struct {
+	ColumnChunk
+}
+
+func (c *asyncColumnChunk) Pages() Pages {
+	return AsyncPages(c.ColumnChunk.Pages())
+}
+
+// NewColumnChunkRowReader creates a new RowReadSeekCloser that reads rows
+// from the given column chunks.
+func NewColumnChunkRowReader(columns []ColumnChunk) RowReadSeekCloser {
+	return newRowGroupRows(nil, columns, defaultValueBufferSize)
+}
+
+// ColumnChunkValueReader is an interface for reading values from a column chunk.
+type ColumnChunkValueReader interface {
+	ValueReader
+	RowSeeker
+	io.Closer
+}
+
+// NewColumnChunkValueReader creates a new ColumnChunkValueReader for the given
+// column chunk.
+func NewColumnChunkValueReader(column ColumnChunk) ColumnChunkValueReader {
+	return &columnChunkValueReader{pages: column.Pages(), release: Release}
+}
+
+type columnChunkValueReader struct {
+	pages   Pages
+	page    Page
+	values  ValueReader
+	release func(Page)
+}
+
+func (r *columnChunkValueReader) clear() {
+	if r.page != nil {
+		r.release(r.page)
+		r.page = nil
+		r.values = nil
+	}
+}
+
+func (r *columnChunkValueReader) Reset() {
+	if r.pages != nil {
+		// Ignore errors because we are resetting the reader; if the error
+		// persists we will see it on the next read, and otherwise we can
+		// read back from the beginning.
+		r.pages.SeekToRow(0)
+	}
+	r.clear()
+}
+
+func (r *columnChunkValueReader) Close() error {
+	var err error
+	if r.pages != nil {
+		err = r.pages.Close()
+		r.pages = nil
+	}
+	r.clear()
+	return err
+}
+
+func (r *columnChunkValueReader) ReadValues(values []Value) (int, error) {
+	if r.pages == nil {
+		return 0, io.EOF
+	}
+
+	for {
+		if r.values == nil {
+			p, err := r.pages.ReadPage()
+			if err != nil {
+				return 0, err
+			}
+			r.page = p
+			r.values = p.Values()
+		}
+
+		n, err := r.values.ReadValues(values)
+		if n > 0 {
+			return n, nil
+		}
+		if err == nil {
+			return 0, io.ErrNoProgress
+		}
+		if err != io.EOF {
+			return 0, err
+		}
+		r.clear()
+	}
+}
+
+func (r *columnChunkValueReader) SeekToRow(rowIndex int64) error {
+	if r.pages == nil {
+		return io.ErrClosedPipe
+	}
+	if err := r.pages.SeekToRow(rowIndex); err != nil {
+		return err
+	}
+	r.clear()
+	return nil
+}
+
 type pageAndValueWriter interface {
 	PageWriter
 	ValueWriter
@@ -186,7 +294,7 @@ func readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFun
 
 	for i := range rows {
 		if col.offset == col.length {
-			n, err := col.values.ReadValues(buf)
+			n, err := col.reader.ReadValues(buf)
 			col.offset = 0
 			col.length = int32(n)
 			if n == 0 && err != nil {
@@ -209,7 +317,7 @@ func readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFun
 			buf := r.buffer(columnIndex)
 
 			if col.offset == col.length {
-				n, err := col.values.ReadValues(buf)
+				n, err := col.reader.ReadValues(buf)
 				col.offset = 0
 				col.length = int32(n)
 				if n == 0 && err != nil {
diff --git a/vendor/github.com/parquet-go/parquet-go/column_index.go b/vendor/github.com/parquet-go/parquet-go/column_index.go
new file mode 100644
index 0000000000000..81e0c4f6cca5e
--- /dev/null
+++ b/vendor/github.com/parquet-go/parquet-go/column_index.go
@@ -0,0 +1,754 @@
+package parquet
+
+import (
+	"github.com/parquet-go/parquet-go/deprecated"
+	"github.com/parquet-go/parquet-go/encoding/plain"
+	"github.com/parquet-go/parquet-go/format"
+	"github.com/parquet-go/parquet-go/internal/unsafecast"
+)
+
+type ColumnIndex interface {
+	// NumPages returns the number of pages in the column index.
+	NumPages() int
+
+	// Returns the number of null values in the page at the given index.
+	NullCount(int) int64
+
+	// Tells whether the page at the given index contains null values only.
+	NullPage(int) bool
+
+	// MinValue and MaxValue return the min/max bounds for the page at the
+	// given index in the column.
+	MinValue(int) Value
+	MaxValue(int) Value
+
+	// IsAscending returns true if the column index min/max values are sorted
+	// in ascending order (based on the ordering rules of the column's logical
+	// type).
+	IsAscending() bool
+
+	// IsDescending returns true if the column index min/max values are sorted
+	// in descending order (based on the ordering rules of the column's logical
+	// type).
+	IsDescending() bool
+}
+
+// NewColumnIndex constructs a ColumnIndex instance from the given parquet
+// format column index. The kind argument configures the type of values
+// returned by the MinValue and MaxValue methods.
+func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex {
+	return &formatColumnIndex{
+		kind:  kind,
+		index: index,
+	}
+}
+
+type formatColumnIndex struct {
+	kind  Kind
+	index *format.ColumnIndex
+}
+
+func (f *formatColumnIndex) NumPages() int {
+	return len(f.index.MinValues)
+}
+
+func (f *formatColumnIndex) NullCount(i int) int64 {
+	if len(f.index.NullCounts) > 0 {
+		return f.index.NullCounts[i]
+	}
+	return 0
+}
+
+func (f *formatColumnIndex) NullPage(i int) bool {
+	return len(f.index.NullPages) > 0 && f.index.NullPages[i]
+}
+
+func (f *formatColumnIndex) MinValue(i int) Value {
+	if f.NullPage(i) {
+		return Value{}
+	}
+	return f.kind.Value(f.index.MinValues[i])
+}
+
+func (f *formatColumnIndex) MaxValue(i int) Value {
+	if f.NullPage(i) {
+		return Value{}
+	}
+	return f.kind.Value(f.index.MaxValues[i])
+}
+
+func (f *formatColumnIndex) IsAscending() bool {
+	return f.index.BoundaryOrder == format.Ascending
+}
+
+func (f *formatColumnIndex) IsDescending() bool {
+	return f.index.BoundaryOrder == format.Descending
+}
+
+type FileColumnIndex struct {
+	index *format.ColumnIndex
+	kind  Kind
+}
+
+func (i *FileColumnIndex) NumPages() int {
+	return len(i.index.NullPages)
+}
+
+func (i *FileColumnIndex) NullCount(j int) int64 {
+	if len(i.index.NullCounts) > 0 {
+		return i.index.NullCounts[j]
+	}
+	return 0
+}
+
+func (i *FileColumnIndex) NullPage(j int) bool {
+	return isNullPage(j, i.index)
+}
+
+func (i *FileColumnIndex) MinValue(j int) Value {
+	if i.NullPage(j) {
+		return Value{}
+	}
+	return i.makeValue(i.index.MinValues[j])
+}
+
+func (i *FileColumnIndex) MaxValue(j int) Value {
+	if i.NullPage(j) {
+		return Value{}
+	}
+	return i.makeValue(i.index.MaxValues[j])
+}
+
+func (i *FileColumnIndex) IsAscending() bool {
+	return i.index.BoundaryOrder == format.Ascending
+}
+
+func (i *FileColumnIndex) IsDescending() bool {
+	return i.index.BoundaryOrder == format.Descending
+}
+
+func (i *FileColumnIndex) makeValue(b []byte) Value {
+	return i.kind.Value(b)
+}
+
+func isNullPage(j int, index *format.ColumnIndex) bool {
+	return len(index.NullPages) > 0 && index.NullPages[j]
+}
+
+type emptyColumnIndex struct{}
+
+func (emptyColumnIndex) NumPages() int       { return 0 }
+func (emptyColumnIndex) NullCount(int) int64 { return 0 }
+func (emptyColumnIndex) NullPage(int) bool   { return false }
+func (emptyColumnIndex) MinValue(int) Value  { return Value{} }
+func (emptyColumnIndex) MaxValue(int) Value  { return Value{} }
+func (emptyColumnIndex) IsAscending() bool   { return false }
+func (emptyColumnIndex) IsDescending() bool  { return false }
+
+type booleanColumnIndex struct{ page *booleanPage }
+
+func (i booleanColumnIndex) NumPages() int       { return 1 }
+func (i booleanColumnIndex) NullCount(int) int64 { return 0 }
+func (i booleanColumnIndex) NullPage(int) bool   { return false }
+func (i booleanColumnIndex) MinValue(int) Value  { return makeValueBoolean(i.page.min()) }
+func (i booleanColumnIndex) MaxValue(int) Value  { return makeValueBoolean(i.page.max()) }
+func (i booleanColumnIndex) IsAscending() bool   { return false }
+func (i booleanColumnIndex) IsDescending() bool  { return false }
+
+type int32ColumnIndex struct{ page *int32Page }
+
+func (i int32ColumnIndex) NumPages() int       { return 1 }
+func (i int32ColumnIndex) NullCount(int) int64 { return 0 }
+func (i int32ColumnIndex) NullPage(int) bool   { return false }
+func (i int32ColumnIndex) MinValue(int) Value  { return makeValueInt32(i.page.min()) }
+func (i int32ColumnIndex) 
MaxValue(int) Value { return makeValueInt32(i.page.max()) } +func (i int32ColumnIndex) IsAscending() bool { return false } +func (i int32ColumnIndex) IsDescending() bool { return false } + +type int64ColumnIndex struct{ page *int64Page } + +func (i int64ColumnIndex) NumPages() int { return 1 } +func (i int64ColumnIndex) NullCount(int) int64 { return 0 } +func (i int64ColumnIndex) NullPage(int) bool { return false } +func (i int64ColumnIndex) MinValue(int) Value { return makeValueInt64(i.page.min()) } +func (i int64ColumnIndex) MaxValue(int) Value { return makeValueInt64(i.page.max()) } +func (i int64ColumnIndex) IsAscending() bool { return false } +func (i int64ColumnIndex) IsDescending() bool { return false } + +type int96ColumnIndex struct{ page *int96Page } + +func (i int96ColumnIndex) NumPages() int { return 1 } +func (i int96ColumnIndex) NullCount(int) int64 { return 0 } +func (i int96ColumnIndex) NullPage(int) bool { return false } +func (i int96ColumnIndex) MinValue(int) Value { return makeValueInt96(i.page.min()) } +func (i int96ColumnIndex) MaxValue(int) Value { return makeValueInt96(i.page.max()) } +func (i int96ColumnIndex) IsAscending() bool { return false } +func (i int96ColumnIndex) IsDescending() bool { return false } + +type floatColumnIndex struct{ page *floatPage } + +func (i floatColumnIndex) NumPages() int { return 1 } +func (i floatColumnIndex) NullCount(int) int64 { return 0 } +func (i floatColumnIndex) NullPage(int) bool { return false } +func (i floatColumnIndex) MinValue(int) Value { return makeValueFloat(i.page.min()) } +func (i floatColumnIndex) MaxValue(int) Value { return makeValueFloat(i.page.max()) } +func (i floatColumnIndex) IsAscending() bool { return false } +func (i floatColumnIndex) IsDescending() bool { return false } + +type doubleColumnIndex struct{ page *doublePage } + +func (i doubleColumnIndex) NumPages() int { return 1 } +func (i doubleColumnIndex) NullCount(int) int64 { return 0 } +func (i doubleColumnIndex) NullPage(int) bool { return false } +func (i doubleColumnIndex) MinValue(int) Value { return makeValueDouble(i.page.min()) } +func (i doubleColumnIndex) MaxValue(int) Value { return makeValueDouble(i.page.max()) } +func (i doubleColumnIndex) IsAscending() bool { return false } +func (i doubleColumnIndex) IsDescending() bool { return false } + +type byteArrayColumnIndex struct{ page *byteArrayPage } + +func (i byteArrayColumnIndex) NumPages() int { return 1 } +func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 } +func (i byteArrayColumnIndex) NullPage(int) bool { return false } +func (i byteArrayColumnIndex) MinValue(int) Value { return makeValueBytes(ByteArray, i.page.min()) } +func (i byteArrayColumnIndex) MaxValue(int) Value { return makeValueBytes(ByteArray, i.page.max()) } +func (i byteArrayColumnIndex) IsAscending() bool { return false } +func (i byteArrayColumnIndex) IsDescending() bool { return false } + +type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage } + +func (i fixedLenByteArrayColumnIndex) NumPages() int { return 1 } +func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 } +func (i fixedLenByteArrayColumnIndex) NullPage(int) bool { return false } +func (i fixedLenByteArrayColumnIndex) MinValue(int) Value { + return makeValueBytes(FixedLenByteArray, i.page.min()) +} +func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value { + return makeValueBytes(FixedLenByteArray, i.page.max()) +} +func (i fixedLenByteArrayColumnIndex) IsAscending() bool { return false } +func (i 
fixedLenByteArrayColumnIndex) IsDescending() bool { return false } + +type uint32ColumnIndex struct{ page *uint32Page } + +func (i uint32ColumnIndex) NumPages() int { return 1 } +func (i uint32ColumnIndex) NullCount(int) int64 { return 0 } +func (i uint32ColumnIndex) NullPage(int) bool { return false } +func (i uint32ColumnIndex) MinValue(int) Value { return makeValueUint32(i.page.min()) } +func (i uint32ColumnIndex) MaxValue(int) Value { return makeValueUint32(i.page.max()) } +func (i uint32ColumnIndex) IsAscending() bool { return false } +func (i uint32ColumnIndex) IsDescending() bool { return false } + +type uint64ColumnIndex struct{ page *uint64Page } + +func (i uint64ColumnIndex) NumPages() int { return 1 } +func (i uint64ColumnIndex) NullCount(int) int64 { return 0 } +func (i uint64ColumnIndex) NullPage(int) bool { return false } +func (i uint64ColumnIndex) MinValue(int) Value { return makeValueUint64(i.page.min()) } +func (i uint64ColumnIndex) MaxValue(int) Value { return makeValueUint64(i.page.max()) } +func (i uint64ColumnIndex) IsAscending() bool { return false } +func (i uint64ColumnIndex) IsDescending() bool { return false } + +type be128ColumnIndex struct{ page *be128Page } + +func (i be128ColumnIndex) NumPages() int { return 1 } +func (i be128ColumnIndex) NullCount(int) int64 { return 0 } +func (i be128ColumnIndex) NullPage(int) bool { return false } +func (i be128ColumnIndex) MinValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.min()) } +func (i be128ColumnIndex) MaxValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.max()) } +func (i be128ColumnIndex) IsAscending() bool { return false } +func (i be128ColumnIndex) IsDescending() bool { return false } + +// The ColumnIndexer interface is implemented by types that support generating +// parquet column indexes. +// +// The package does not export any types that implement this interface, programs +// must call NewColumnIndexer on a Type instance to construct column indexers. +type ColumnIndexer interface { + // Resets the column indexer state. + Reset() + + // Add a page to the column indexer. + IndexPage(numValues, numNulls int64, min, max Value) + + // Generates a format.ColumnIndex value from the current state of the + // column indexer. + // + // The returned value may reference internal buffers, in which case the + // values remain valid until the next call to IndexPage or Reset on the + // column indexer. 
+ ColumnIndex() format.ColumnIndex +} + +type baseColumnIndexer struct { + nullPages []bool + nullCounts []int64 +} + +func (i *baseColumnIndexer) reset() { + i.nullPages = i.nullPages[:0] + i.nullCounts = i.nullCounts[:0] +} + +func (i *baseColumnIndexer) observe(numValues, numNulls int64) { + i.nullPages = append(i.nullPages, numValues == numNulls) + i.nullCounts = append(i.nullCounts, numNulls) +} + +func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex { + nullPages := make([]bool, len(i.nullPages)) + copy(nullPages, i.nullPages) + nullCounts := make([]int64, len(i.nullCounts)) + copy(nullCounts, i.nullCounts) + return format.ColumnIndex{ + NullPages: nullPages, + NullCounts: nullCounts, + MinValues: minValues, + MaxValues: maxValues, + BoundaryOrder: boundaryOrderOf(minOrder, maxOrder), + } +} + +type booleanColumnIndexer struct { + baseColumnIndexer + minValues []bool + maxValues []bool +} + +func newBooleanColumnIndexer() *booleanColumnIndexer { + return new(booleanColumnIndexer) +} + +func (i *booleanColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.boolean()) + i.maxValues = append(i.maxValues, max.boolean()) +} + +func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 1), + splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 1), + orderOfBool(i.minValues), + orderOfBool(i.maxValues), + ) +} + +type int32ColumnIndexer struct { + baseColumnIndexer + minValues []int32 + maxValues []int32 +} + +func newInt32ColumnIndexer() *int32ColumnIndexer { + return new(int32ColumnIndexer) +} + +func (i *int32ColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.int32()) + i.maxValues = append(i.maxValues, max.int32()) +} + +func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexInt32Values(i.minValues), 4), + splitFixedLenByteArrays(columnIndexInt32Values(i.maxValues), 4), + orderOfInt32(i.minValues), + orderOfInt32(i.maxValues), + ) +} + +type int64ColumnIndexer struct { + baseColumnIndexer + minValues []int64 + maxValues []int64 +} + +func newInt64ColumnIndexer() *int64ColumnIndexer { + return new(int64ColumnIndexer) +} + +func (i *int64ColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.int64()) + i.maxValues = append(i.maxValues, max.int64()) +} + +func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexInt64Values(i.minValues), 8), + splitFixedLenByteArrays(columnIndexInt64Values(i.maxValues), 8), + orderOfInt64(i.minValues), + orderOfInt64(i.maxValues), + ) +} + +type int96ColumnIndexer struct { + baseColumnIndexer + minValues []deprecated.Int96 + maxValues []deprecated.Int96 +} + +func newInt96ColumnIndexer() *int96ColumnIndexer { + return new(int96ColumnIndexer) 
+} + +func (i *int96ColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.Int96()) + i.maxValues = append(i.maxValues, max.Int96()) +} + +func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexInt96Values(i.minValues), 12), + splitFixedLenByteArrays(columnIndexInt96Values(i.maxValues), 12), + deprecated.OrderOfInt96(i.minValues), + deprecated.OrderOfInt96(i.maxValues), + ) +} + +type floatColumnIndexer struct { + baseColumnIndexer + minValues []float32 + maxValues []float32 +} + +func newFloatColumnIndexer() *floatColumnIndexer { + return new(floatColumnIndexer) +} + +func (i *floatColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.float()) + i.maxValues = append(i.maxValues, max.float()) +} + +func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexFloatValues(i.minValues), 4), + splitFixedLenByteArrays(columnIndexFloatValues(i.maxValues), 4), + orderOfFloat32(i.minValues), + orderOfFloat32(i.maxValues), + ) +} + +type doubleColumnIndexer struct { + baseColumnIndexer + minValues []float64 + maxValues []float64 +} + +func newDoubleColumnIndexer() *doubleColumnIndexer { + return new(doubleColumnIndexer) +} + +func (i *doubleColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.double()) + i.maxValues = append(i.maxValues, max.double()) +} + +func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexDoubleValues(i.minValues), 8), + splitFixedLenByteArrays(columnIndexDoubleValues(i.maxValues), 8), + orderOfFloat64(i.minValues), + orderOfFloat64(i.maxValues), + ) +} + +type byteArrayColumnIndexer struct { + baseColumnIndexer + sizeLimit int + minValues []byte + maxValues []byte +} + +func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { + return &byteArrayColumnIndexer{sizeLimit: sizeLimit} +} + +func (i *byteArrayColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = plain.AppendByteArray(i.minValues, min.byteArray()) + i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray()) +} + +func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { + minValues := splitByteArrays(i.minValues) + maxValues := splitByteArrays(i.maxValues) + if sizeLimit := i.sizeLimit; sizeLimit > 0 { + for i, v := range minValues { + minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) + } + for i, v := range maxValues { + maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) + } + } + return i.columnIndex( + minValues, + maxValues, + orderOfBytes(minValues), + orderOfBytes(maxValues), + ) +} + +type fixedLenByteArrayColumnIndexer struct { + baseColumnIndexer + 
size int + sizeLimit int + minValues []byte + maxValues []byte +} + +func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer { + return &fixedLenByteArrayColumnIndexer{ + size: size, + sizeLimit: sizeLimit, + } +} + +func (i *fixedLenByteArrayColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.byteArray()...) + i.maxValues = append(i.maxValues, max.byteArray()...) +} + +func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { + minValues := splitFixedLenByteArrays(i.minValues, i.size) + maxValues := splitFixedLenByteArrays(i.maxValues, i.size) + if sizeLimit := i.sizeLimit; sizeLimit > 0 { + for i, v := range minValues { + minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) + } + for i, v := range maxValues { + maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) + } + } + return i.columnIndex( + minValues, + maxValues, + orderOfBytes(minValues), + orderOfBytes(maxValues), + ) +} + +type uint32ColumnIndexer struct { + baseColumnIndexer + minValues []uint32 + maxValues []uint32 +} + +func newUint32ColumnIndexer() *uint32ColumnIndexer { + return new(uint32ColumnIndexer) +} + +func (i *uint32ColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.uint32()) + i.maxValues = append(i.maxValues, max.uint32()) +} + +func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexUint32Values(i.minValues), 4), + splitFixedLenByteArrays(columnIndexUint32Values(i.maxValues), 4), + orderOfUint32(i.minValues), + orderOfUint32(i.maxValues), + ) +} + +type uint64ColumnIndexer struct { + baseColumnIndexer + minValues []uint64 + maxValues []uint64 +} + +func newUint64ColumnIndexer() *uint64ColumnIndexer { + return new(uint64ColumnIndexer) +} + +func (i *uint64ColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + i.minValues = append(i.minValues, min.uint64()) + i.maxValues = append(i.maxValues, max.uint64()) +} + +func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex { + return i.columnIndex( + splitFixedLenByteArrays(columnIndexUint64Values(i.minValues), 8), + splitFixedLenByteArrays(columnIndexUint64Values(i.maxValues), 8), + orderOfUint64(i.minValues), + orderOfUint64(i.maxValues), + ) +} + +type be128ColumnIndexer struct { + baseColumnIndexer + minValues [][16]byte + maxValues [][16]byte +} + +func newBE128ColumnIndexer() *be128ColumnIndexer { + return new(be128ColumnIndexer) +} + +func (i *be128ColumnIndexer) Reset() { + i.reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] +} + +func (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { + i.observe(numValues, numNulls) + if !min.IsNull() { + i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray())) + } + if !max.IsNull() { + i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray())) + } +} + +func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex { + 
minValues := splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 16)
+	maxValues := splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 16)
+	return i.columnIndex(
+		minValues,
+		maxValues,
+		orderOfBytes(minValues),
+		orderOfBytes(maxValues),
+	)
+}
+
+func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte {
+	if len(value) > sizeLimit {
+		value = value[:sizeLimit]
+	}
+	return value
+}
+
+// truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit.
+// If the given byte array is truncated, it is incremented by 1 in place.
+func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte {
+	if len(value) > sizeLimit {
+		value = value[:sizeLimit]
+		incrementByteArrayInplace(value)
+	}
+	return value
+}
+
+// incrementByteArrayInplace increments the given byte array by 1.
+// Reference: https://github.com/apache/parquet-java/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124
+func incrementByteArrayInplace(value []byte) {
+	for i := len(value) - 1; i >= 0; i-- {
+		value[i]++
+		if value[i] != 0 { // Did not overflow: 0xFF -> 0x00
+			return
+		}
+	}
+	// Fully overflowed, so restore all to 0xFF
+	for i := range value {
+		value[i] = 0xFF
+	}
+}
+
+func splitByteArrays(data []byte) [][]byte {
+	length := 0
+	plain.RangeByteArray(data, func([]byte) error {
+		length++
+		return nil
+	})
+	buffer := make([]byte, 0, len(data)-(4*length))
+	values := make([][]byte, 0, length)
+	plain.RangeByteArray(data, func(value []byte) error {
+		offset := len(buffer)
+		buffer = append(buffer, value...)
+		values = append(values, buffer[offset:])
+		return nil
+	})
+	return values
+}
+
+func splitFixedLenByteArrays(data []byte, size int) [][]byte {
+	data = copyBytes(data)
+	values := make([][]byte, len(data)/size)
+	for i := range values {
+		j := (i + 0) * size
+		k := (i + 1) * size
+		values[i] = data[j:k:k]
+	}
+	return values
+}
+
+func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder {
+	if minOrder == maxOrder {
+		switch {
+		case minOrder > 0:
+			return format.Ascending
+		case minOrder < 0:
+			return format.Descending
+		}
+	}
+	return format.Unordered
+}
diff --git a/vendor/github.com/parquet-go/parquet-go/column_index_be.go b/vendor/github.com/parquet-go/parquet-go/column_index_be.go
index f3ea2e7bdfb3b..ace82ae3207e4 100644
--- a/vendor/github.com/parquet-go/parquet-go/column_index_be.go
+++ b/vendor/github.com/parquet-go/parquet-go/column_index_be.go
@@ -6,849 +6,79 @@ package parquet
 
 import (
 	"encoding/binary"
 
-	"github.com/parquet-go/parquet-go/deprecated"
-	"github.com/parquet-go/parquet-go/encoding/plain"
-	"github.com/parquet-go/parquet-go/format"
-	"github.com/parquet-go/parquet-go/internal/unsafecast"
 	"math"
-)
-
-type ColumnIndex interface {
-	// NumPages returns the number of paged in the column index.
-	NumPages() int
-
-	// Returns the number of null values in the page at the given index.
-	NullCount(int) int64
-
-	// Tells whether the page at the given index contains null values only.
-	NullPage(int) bool
-
-	// PageIndex return min/max bounds for the page at the given index in the
-	// column.
-	MinValue(int) Value
-	MaxValue(int) Value
-
-	// IsAscending returns true if the column index min/max values are sorted
-	// in ascending order (based on the ordering rules of the column's logical
-	// type).
- IsAscending() bool - - // IsDescending returns true if the column index min/max values are sorted - // in descending order (based on the ordering rules of the column's logical - // type). - IsDescending() bool -} - -// NewColumnIndex constructs a ColumnIndex instance from the given parquet -// format column index. The kind argument configures the type of values -func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex { - return &formatColumnIndex{ - kind: kind, - index: index, - } -} - -type formatColumnIndex struct { - kind Kind - index *format.ColumnIndex -} - -func (f *formatColumnIndex) NumPages() int { - return len(f.index.MinValues) -} - -func (f *formatColumnIndex) NullCount(i int) int64 { - if len(f.index.NullCounts) > 0 { - return f.index.NullCounts[i] - } - return 0 -} - -func (f *formatColumnIndex) NullPage(i int) bool { - return len(f.index.NullPages) > 0 && f.index.NullPages[i] -} - -func (f *formatColumnIndex) MinValue(i int) Value { - if f.NullPage(i) { - return Value{} - } - return f.kind.Value(f.index.MinValues[i]) -} - -func (f *formatColumnIndex) MaxValue(i int) Value { - if f.NullPage(i) { - return Value{} - } - return f.kind.Value(f.index.MaxValues[i]) -} - -func (f *formatColumnIndex) IsAscending() bool { - return f.index.BoundaryOrder == format.Ascending -} - -func (f *formatColumnIndex) IsDescending() bool { - return f.index.BoundaryOrder == format.Descending -} - -type fileColumnIndex struct{ chunk *fileColumnChunk } - -func (i fileColumnIndex) NumPages() int { - return len(i.columnIndex().NullPages) -} - -func (i fileColumnIndex) NullCount(j int) int64 { - index := i.columnIndex() - if len(index.NullCounts) > 0 { - return index.NullCounts[j] - } - return 0 -} - -func (i fileColumnIndex) NullPage(j int) bool { - return isNullPage(j, i.columnIndex()) -} - -func (i fileColumnIndex) MinValue(j int) Value { - index := i.columnIndex() - if isNullPage(j, index) { - return Value{} - } - return i.makeValue(index.MinValues[j]) -} - -func (i fileColumnIndex) MaxValue(j int) Value { - index := i.columnIndex() - if isNullPage(j, index) { - return Value{} - } - return i.makeValue(index.MaxValues[j]) -} - -func (i fileColumnIndex) IsAscending() bool { - return i.columnIndex().BoundaryOrder == format.Ascending -} - -func (i fileColumnIndex) IsDescending() bool { - return i.columnIndex().BoundaryOrder == format.Descending -} - -func (i *fileColumnIndex) makeValue(b []byte) Value { - return i.chunk.column.typ.Kind().Value(b) -} - -func (i fileColumnIndex) columnIndex() *format.ColumnIndex { return i.chunk.columnIndex.Load() } -func isNullPage(j int, index *format.ColumnIndex) bool { - return len(index.NullPages) > 0 && index.NullPages[j] -} - -type emptyColumnIndex struct{} - -func (emptyColumnIndex) NumPages() int { return 0 } -func (emptyColumnIndex) NullCount(int) int64 { return 0 } -func (emptyColumnIndex) NullPage(int) bool { return false } -func (emptyColumnIndex) MinValue(int) Value { return Value{} } -func (emptyColumnIndex) MaxValue(int) Value { return Value{} } -func (emptyColumnIndex) IsAscending() bool { return false } -func (emptyColumnIndex) IsDescending() bool { return false } - -type booleanColumnIndex struct{ page *booleanPage } - -func (i booleanColumnIndex) NumPages() int { return 1 } -func (i booleanColumnIndex) NullCount(int) int64 { return 0 } -func (i booleanColumnIndex) NullPage(int) bool { return false } -func (i booleanColumnIndex) MinValue(int) Value { return makeValueBoolean(i.page.min()) } -func (i booleanColumnIndex) MaxValue(int) Value 
{ return makeValueBoolean(i.page.max()) } -func (i booleanColumnIndex) IsAscending() bool { return false } -func (i booleanColumnIndex) IsDescending() bool { return false } - -type int32ColumnIndex struct{ page *int32Page } - -func (i int32ColumnIndex) NumPages() int { return 1 } -func (i int32ColumnIndex) NullCount(int) int64 { return 0 } -func (i int32ColumnIndex) NullPage(int) bool { return false } -func (i int32ColumnIndex) MinValue(int) Value { return makeValueInt32(i.page.min()) } -func (i int32ColumnIndex) MaxValue(int) Value { return makeValueInt32(i.page.max()) } -func (i int32ColumnIndex) IsAscending() bool { return false } -func (i int32ColumnIndex) IsDescending() bool { return false } -type int64ColumnIndex struct{ page *int64Page } - -func (i int64ColumnIndex) NumPages() int { return 1 } -func (i int64ColumnIndex) NullCount(int) int64 { return 0 } -func (i int64ColumnIndex) NullPage(int) bool { return false } -func (i int64ColumnIndex) MinValue(int) Value { return makeValueInt64(i.page.min()) } -func (i int64ColumnIndex) MaxValue(int) Value { return makeValueInt64(i.page.max()) } -func (i int64ColumnIndex) IsAscending() bool { return false } -func (i int64ColumnIndex) IsDescending() bool { return false } - -type int96ColumnIndex struct{ page *int96Page } - -func (i int96ColumnIndex) NumPages() int { return 1 } -func (i int96ColumnIndex) NullCount(int) int64 { return 0 } -func (i int96ColumnIndex) NullPage(int) bool { return false } -func (i int96ColumnIndex) MinValue(int) Value { return makeValueInt96(i.page.min()) } -func (i int96ColumnIndex) MaxValue(int) Value { return makeValueInt96(i.page.max()) } -func (i int96ColumnIndex) IsAscending() bool { return false } -func (i int96ColumnIndex) IsDescending() bool { return false } - -type floatColumnIndex struct{ page *floatPage } - -func (i floatColumnIndex) NumPages() int { return 1 } -func (i floatColumnIndex) NullCount(int) int64 { return 0 } -func (i floatColumnIndex) NullPage(int) bool { return false } -func (i floatColumnIndex) MinValue(int) Value { return makeValueFloat(i.page.min()) } -func (i floatColumnIndex) MaxValue(int) Value { return makeValueFloat(i.page.max()) } -func (i floatColumnIndex) IsAscending() bool { return false } -func (i floatColumnIndex) IsDescending() bool { return false } - -type doubleColumnIndex struct{ page *doublePage } - -func (i doubleColumnIndex) NumPages() int { return 1 } -func (i doubleColumnIndex) NullCount(int) int64 { return 0 } -func (i doubleColumnIndex) NullPage(int) bool { return false } -func (i doubleColumnIndex) MinValue(int) Value { return makeValueDouble(i.page.min()) } -func (i doubleColumnIndex) MaxValue(int) Value { return makeValueDouble(i.page.max()) } -func (i doubleColumnIndex) IsAscending() bool { return false } -func (i doubleColumnIndex) IsDescending() bool { return false } - -type byteArrayColumnIndex struct{ page *byteArrayPage } - -func (i byteArrayColumnIndex) NumPages() int { return 1 } -func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 } -func (i byteArrayColumnIndex) NullPage(int) bool { return false } -func (i byteArrayColumnIndex) MinValue(int) Value { return makeValueBytes(ByteArray, i.page.min()) } -func (i byteArrayColumnIndex) MaxValue(int) Value { return makeValueBytes(ByteArray, i.page.max()) } -func (i byteArrayColumnIndex) IsAscending() bool { return false } -func (i byteArrayColumnIndex) IsDescending() bool { return false } - -type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage } - -func (i fixedLenByteArrayColumnIndex) 
NumPages() int { return 1 } -func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 } -func (i fixedLenByteArrayColumnIndex) NullPage(int) bool { return false } -func (i fixedLenByteArrayColumnIndex) MinValue(int) Value { - return makeValueBytes(FixedLenByteArray, i.page.min()) -} -func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value { - return makeValueBytes(FixedLenByteArray, i.page.max()) -} -func (i fixedLenByteArrayColumnIndex) IsAscending() bool { return false } -func (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false } - -type uint32ColumnIndex struct{ page *uint32Page } - -func (i uint32ColumnIndex) NumPages() int { return 1 } -func (i uint32ColumnIndex) NullCount(int) int64 { return 0 } -func (i uint32ColumnIndex) NullPage(int) bool { return false } -func (i uint32ColumnIndex) MinValue(int) Value { return makeValueUint32(i.page.min()) } -func (i uint32ColumnIndex) MaxValue(int) Value { return makeValueUint32(i.page.max()) } -func (i uint32ColumnIndex) IsAscending() bool { return false } -func (i uint32ColumnIndex) IsDescending() bool { return false } - -type uint64ColumnIndex struct{ page *uint64Page } - -func (i uint64ColumnIndex) NumPages() int { return 1 } -func (i uint64ColumnIndex) NullCount(int) int64 { return 0 } -func (i uint64ColumnIndex) NullPage(int) bool { return false } -func (i uint64ColumnIndex) MinValue(int) Value { return makeValueUint64(i.page.min()) } -func (i uint64ColumnIndex) MaxValue(int) Value { return makeValueUint64(i.page.max()) } -func (i uint64ColumnIndex) IsAscending() bool { return false } -func (i uint64ColumnIndex) IsDescending() bool { return false } - -type be128ColumnIndex struct{ page *be128Page } - -func (i be128ColumnIndex) NumPages() int { return 1 } -func (i be128ColumnIndex) NullCount(int) int64 { return 0 } -func (i be128ColumnIndex) NullPage(int) bool { return false } -func (i be128ColumnIndex) MinValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.min()) } -func (i be128ColumnIndex) MaxValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.max()) } -func (i be128ColumnIndex) IsAscending() bool { return false } -func (i be128ColumnIndex) IsDescending() bool { return false } - -// The ColumnIndexer interface is implemented by types that support generating -// parquet column indexes. -// -// The package does not export any types that implement this interface, programs -// must call NewColumnIndexer on a Type instance to construct column indexers. -type ColumnIndexer interface { - // Resets the column indexer state. - Reset() - - // Add a page to the column indexer. - IndexPage(numValues, numNulls int64, min, max Value) - - // Generates a format.ColumnIndex value from the current state of the - // column indexer. - // - // The returned value may reference internal buffers, in which case the - // values remain valid until the next call to IndexPage or Reset on the - // column indexer. 
- ColumnIndex() format.ColumnIndex -} - -type baseColumnIndexer struct { - nullPages []bool - nullCounts []int64 -} - -func (i *baseColumnIndexer) reset() { - i.nullPages = i.nullPages[:0] - i.nullCounts = i.nullCounts[:0] -} - -func (i *baseColumnIndexer) observe(numValues, numNulls int64) { - i.nullPages = append(i.nullPages, numValues == numNulls) - i.nullCounts = append(i.nullCounts, numNulls) -} - -func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex { - nullPages := make([]bool, len(i.nullPages)) - copy(nullPages, i.nullPages) - nullCounts := make([]int64, len(i.nullCounts)) - copy(nullCounts, i.nullCounts) - return format.ColumnIndex{ - NullPages: nullPages, - NullCounts: nullCounts, - MinValues: minValues, - MaxValues: maxValues, - BoundaryOrder: boundaryOrderOf(minOrder, maxOrder), - } -} - -type booleanColumnIndexer struct { - baseColumnIndexer - minValues []bool - maxValues []bool -} - -func newBooleanColumnIndexer() *booleanColumnIndexer { - return new(booleanColumnIndexer) -} - -func (i *booleanColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.boolean()) - i.maxValues = append(i.maxValues, max.boolean()) -} - -func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 1), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 1), - orderOfBool(i.minValues), - orderOfBool(i.maxValues), - ) -} - -type int32ColumnIndexer struct { - baseColumnIndexer - minValues []int32 - maxValues []int32 -} - -func newInt32ColumnIndexer() *int32ColumnIndexer { - return new(int32ColumnIndexer) -} - -func (i *int32ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.int32()) - i.maxValues = append(i.maxValues, max.int32()) -} + "github.com/parquet-go/parquet-go/deprecated" +) -func reverseInt32MinMaxValues(mLen int, mVal []int32) []byte { - buf := make([]byte, mLen*4) +func columnIndexInt32Values(values []int32) []byte { + buf := make([]byte, len(values)*4) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint32(buf[idx:(4+idx)], uint32(mVal[k])) + for k := range len(values) { + binary.LittleEndian.PutUint32(buf[idx:(4+idx)], uint32(values[k])) idx += 4 } return buf } -func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex { - byteMin := reverseInt32MinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseInt32MinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 4), - splitFixedLenByteArrays(byteMax, 4), - orderOfInt32(i.minValues), - orderOfInt32(i.maxValues), - ) -} - -type int64ColumnIndexer struct { - baseColumnIndexer - minValues []int64 - maxValues []int64 -} - -func newInt64ColumnIndexer() *int64ColumnIndexer { - return new(int64ColumnIndexer) -} - -func (i *int64ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.int64()) - 
i.maxValues = append(i.maxValues, max.int64()) -} - -func reverseInt64MinMaxValues(mLen int, mVal []int64) []byte { - buf := make([]byte, mLen*8) +func columnIndexInt64Values(values []int64) []byte { + buf := make([]byte, len(values)*8) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint64(buf[idx:(8+idx)], uint64(mVal[k])) + for k := range len(values) { + binary.LittleEndian.PutUint64(buf[idx:(8+idx)], uint64(values[k])) idx += 8 } return buf } -func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex { - byteMin := reverseInt64MinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseInt64MinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 8), - splitFixedLenByteArrays(byteMax, 8), - orderOfInt64(i.minValues), - orderOfInt64(i.maxValues), - ) -} - -type int96ColumnIndexer struct { - baseColumnIndexer - minValues []deprecated.Int96 - maxValues []deprecated.Int96 -} - -func newInt96ColumnIndexer() *int96ColumnIndexer { - return new(int96ColumnIndexer) -} - -func (i *int96ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.Int96()) - i.maxValues = append(i.maxValues, max.Int96()) -} - -func reverseInt96MinMaxValues(mLen int, mVal []deprecated.Int96) []byte { - buf := make([]byte, mLen*12) +func columnIndexInt96Values(values []deprecated.Int96) []byte { + buf := make([]byte, len(values)*12) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint32(buf[idx:(4+idx)], uint32(mVal[k][0])) - binary.LittleEndian.PutUint32(buf[(4+idx):(8+idx)], uint32(mVal[k][1])) - binary.LittleEndian.PutUint32(buf[(8+idx):(12+idx)], uint32(mVal[k][2])) + for k := range len(values) { + binary.LittleEndian.PutUint32(buf[idx:(4+idx)], uint32(values[k][0])) + binary.LittleEndian.PutUint32(buf[(4+idx):(8+idx)], uint32(values[k][1])) + binary.LittleEndian.PutUint32(buf[(8+idx):(12+idx)], uint32(values[k][2])) idx += 12 } return buf } -func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex { - byteMin := reverseInt96MinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseInt96MinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 12), - splitFixedLenByteArrays(byteMax, 12), - deprecated.OrderOfInt96(i.minValues), - deprecated.OrderOfInt96(i.maxValues), - ) -} - -type floatColumnIndexer struct { - baseColumnIndexer - minValues []float32 - maxValues []float32 -} - -func newFloatColumnIndexer() *floatColumnIndexer { - return new(floatColumnIndexer) -} - -func (i *floatColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.float()) - i.maxValues = append(i.maxValues, max.float()) -} - -func reverseFloatMinMaxValues(mLen int, mVal []float32) []byte { - buf := make([]byte, mLen*4) +func columnIndexFloatValues(values []float32) []byte { + buf := make([]byte, len(values)*4) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint32(buf[idx:(4+idx)], math.Float32bits(mVal[k])) + for k := range len(values) { + binary.LittleEndian.PutUint32(buf[idx:(4+idx)], math.Float32bits(values[k])) idx += 4 } return buf } -func (i *floatColumnIndexer) ColumnIndex() 
format.ColumnIndex { - byteMin := reverseFloatMinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseFloatMinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 4), - splitFixedLenByteArrays(byteMax, 4), - orderOfFloat32(i.minValues), - orderOfFloat32(i.maxValues), - ) -} - -type doubleColumnIndexer struct { - baseColumnIndexer - minValues []float64 - maxValues []float64 -} - -func newDoubleColumnIndexer() *doubleColumnIndexer { - return new(doubleColumnIndexer) -} - -func (i *doubleColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.double()) - i.maxValues = append(i.maxValues, max.double()) -} - -func reverseDoubleMinMaxValues(mLen int, mVal []float64) []byte { - buf := make([]byte, mLen*8) +func columnIndexDoubleValues(values []float64) []byte { + buf := make([]byte, len(values)*8) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint64(buf[idx:(8+idx)], math.Float64bits(mVal[k])) + for k := range len(values) { + binary.LittleEndian.PutUint64(buf[idx:(8+idx)], math.Float64bits(values[k])) idx += 8 } return buf } -func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex { - byteMin := reverseDoubleMinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseDoubleMinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 8), - splitFixedLenByteArrays(byteMax, 8), - orderOfFloat64(i.minValues), - orderOfFloat64(i.maxValues), - ) -} - -type byteArrayColumnIndexer struct { - baseColumnIndexer - sizeLimit int - minValues []byte - maxValues []byte -} - -func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { - return &byteArrayColumnIndexer{sizeLimit: sizeLimit} -} - -func (i *byteArrayColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = plain.AppendByteArray(i.minValues, min.byteArray()) - i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray()) -} - -func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := splitByteArrays(i.minValues) - maxValues := splitByteArrays(i.maxValues) - if sizeLimit := i.sizeLimit; sizeLimit > 0 { - for i, v := range minValues { - minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) - } - for i, v := range maxValues { - maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) - } - } - return i.columnIndex( - minValues, - maxValues, - orderOfBytes(minValues), - orderOfBytes(maxValues), - ) -} - -type fixedLenByteArrayColumnIndexer struct { - baseColumnIndexer - size int - sizeLimit int - minValues []byte - maxValues []byte -} - -func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer { - return &fixedLenByteArrayColumnIndexer{ - size: size, - sizeLimit: sizeLimit, - } -} - -func (i *fixedLenByteArrayColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.byteArray()...) - i.maxValues = append(i.maxValues, max.byteArray()...) 
-} - -func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := splitFixedLenByteArrays(i.minValues, i.size) - maxValues := splitFixedLenByteArrays(i.maxValues, i.size) - if sizeLimit := i.sizeLimit; sizeLimit > 0 { - for i, v := range minValues { - minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) - } - for i, v := range maxValues { - maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) - } - } - return i.columnIndex( - minValues, - maxValues, - orderOfBytes(minValues), - orderOfBytes(maxValues), - ) -} - -type uint32ColumnIndexer struct { - baseColumnIndexer - minValues []uint32 - maxValues []uint32 -} - -func newUint32ColumnIndexer() *uint32ColumnIndexer { - return new(uint32ColumnIndexer) -} - -func (i *uint32ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.uint32()) - i.maxValues = append(i.maxValues, max.uint32()) -} - -func reverseUint32MinMaxValues(mLen int, mVal []uint32) []byte { - buf := make([]byte, mLen*4) +func columnIndexUint32Values(values []uint32) []byte { + buf := make([]byte, len(values)*4) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint32(buf[idx:(4+idx)], mVal[k]) + for k := range len(values) { + binary.LittleEndian.PutUint32(buf[idx:(4+idx)], values[k]) idx += 4 } return buf } -func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex { - byteMin := reverseUint32MinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseUint32MinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 4), - splitFixedLenByteArrays(byteMax, 4), - orderOfUint32(i.minValues), - orderOfUint32(i.maxValues), - ) -} - -type uint64ColumnIndexer struct { - baseColumnIndexer - minValues []uint64 - maxValues []uint64 -} - -func newUint64ColumnIndexer() *uint64ColumnIndexer { - return new(uint64ColumnIndexer) -} - -func (i *uint64ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.uint64()) - i.maxValues = append(i.maxValues, max.uint64()) -} - -func reverseUint64MinMaxValues(mLen int, mVal []uint64) []byte { - buf := make([]byte, mLen*8) +func columnIndexUint64Values(values []uint64) []byte { + buf := make([]byte, len(values)*8) idx := 0 - for k := range mLen { - binary.LittleEndian.PutUint64(buf[idx:(8+idx)], mVal[k]) + for k := range len(values) { + binary.LittleEndian.PutUint64(buf[idx:(8+idx)], values[k]) idx += 8 } return buf } - -func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex { - byteMin := reverseUint64MinMaxValues(len(i.minValues), i.minValues) - byteMax := reverseUint64MinMaxValues(len(i.maxValues), i.maxValues) - - return i.columnIndex( - splitFixedLenByteArrays(byteMin, 8), - splitFixedLenByteArrays(byteMax, 8), - orderOfUint64(i.minValues), - orderOfUint64(i.maxValues), - ) -} - -type be128ColumnIndexer struct { - baseColumnIndexer - minValues [][16]byte - maxValues [][16]byte -} - -func newBE128ColumnIndexer() *be128ColumnIndexer { - return new(be128ColumnIndexer) -} - -func (i *be128ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *be128ColumnIndexer) 
IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - if !min.IsNull() { - i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray())) - } - if !max.IsNull() { - i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray())) - } -} - -func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 16) - maxValues := splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 16) - return i.columnIndex( - minValues, - maxValues, - orderOfBytes(minValues), - orderOfBytes(maxValues), - ) -} - -func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte { - if len(value) > sizeLimit { - value = value[:sizeLimit] - } - return value -} - -// truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit. -// If the given byte array is truncated, it is incremented by 1 in place. -func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte { - if len(value) > sizeLimit { - value = value[:sizeLimit] - incrementByteArrayInplace(value) - } - return value -} - -// incrementByteArray increments the given byte array by 1. -// Reference: https://github.com/apache/parquet-java/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124 -func incrementByteArrayInplace(value []byte) { - for i := len(value) - 1; i >= 0; i-- { - value[i]++ - if value[i] != 0 { // Did not overflow: 0xFF -> 0x00 - return - } - } - // Fully overflowed, so restore all to 0xFF - for i := range value { - value[i] = 0xFF - } -} - -func splitByteArrays(data []byte) [][]byte { - length := 0 - plain.RangeByteArray(data, func([]byte) error { - length++ - return nil - }) - buffer := make([]byte, 0, len(data)-(4*length)) - values := make([][]byte, 0, length) - plain.RangeByteArray(data, func(value []byte) error { - offset := len(buffer) - buffer = append(buffer, value...) - values = append(values, buffer[offset:]) - return nil - }) - return values -} - -func splitFixedLenByteArrays(data []byte, size int) [][]byte { - data = copyBytes(data) - values := make([][]byte, len(data)/size) - for i := range values { - j := (i + 0) * size - k := (i + 1) * size - values[i] = data[j:k:k] - } - return values -} - -func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder { - if minOrder == maxOrder { - switch { - case minOrder > 0: - return format.Ascending - case minOrder < 0: - return format.Descending - } - } - return format.Unordered -} diff --git a/vendor/github.com/parquet-go/parquet-go/column_index_le.go b/vendor/github.com/parquet-go/parquet-go/column_index_le.go index 4d8fec4511a87..6b6ac8f30b382 100644 --- a/vendor/github.com/parquet-go/parquet-go/column_index_le.go +++ b/vendor/github.com/parquet-go/parquet-go/column_index_le.go @@ -6,755 +6,33 @@ package parquet import ( "github.com/parquet-go/parquet-go/deprecated" - "github.com/parquet-go/parquet-go/encoding/plain" - "github.com/parquet-go/parquet-go/format" "github.com/parquet-go/parquet-go/internal/unsafecast" ) -type ColumnIndex interface { - // NumPages returns the number of paged in the column index. - NumPages() int - - // Returns the number of null values in the page at the given index. - NullCount(int) int64 - - // Tells whether the page at the given index contains null values only. - NullPage(int) bool - - // PageIndex return min/max bounds for the page at the given index in the - // column. 
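The truncation helpers relocated by this change implement a rule that is easy to miss: a truncated max bound must still be greater than or equal to every value in the page, so the truncated prefix is incremented by one, with an all-0xFF fallback when every byte overflows. A self-contained sketch of that rule, with hypothetical names mirroring truncateLargeMaxByteArrayValue and incrementByteArrayInplace:

	func truncateMax(value []byte, limit int) []byte {
		if len(value) <= limit {
			return value // already small enough, keep the exact bound
		}
		v := append([]byte(nil), value[:limit]...) // copy the truncated prefix
		for i := len(v) - 1; i >= 0; i-- {
			v[i]++
			if v[i] != 0 { // no carry: the bound now exceeds the original value
				return v
			}
		}
		for i := range v {
			v[i] = 0xFF // every byte carried over; restore to all 0xFF as the vendored helper does
		}
		return v
	}

For example, truncateMax([]byte("abcz"), 2) returns []byte("ac"), which still sorts after "abcz".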
- MinValue(int) Value - MaxValue(int) Value - - // IsAscending returns true if the column index min/max values are sorted - // in ascending order (based on the ordering rules of the column's logical - // type). - IsAscending() bool - - // IsDescending returns true if the column index min/max values are sorted - // in descending order (based on the ordering rules of the column's logical - // type). - IsDescending() bool -} - -// NewColumnIndex constructs a ColumnIndex instance from the given parquet -// format column index. The kind argument configures the type of values -func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex { - return &formatColumnIndex{ - kind: kind, - index: index, - } -} - -type formatColumnIndex struct { - kind Kind - index *format.ColumnIndex -} - -func (f *formatColumnIndex) NumPages() int { - return len(f.index.MinValues) -} - -func (f *formatColumnIndex) NullCount(i int) int64 { - if len(f.index.NullCounts) > 0 { - return f.index.NullCounts[i] - } - return 0 -} - -func (f *formatColumnIndex) NullPage(i int) bool { - return len(f.index.NullPages) > 0 && f.index.NullPages[i] -} - -func (f *formatColumnIndex) MinValue(i int) Value { - if f.NullPage(i) { - return Value{} - } - return f.kind.Value(f.index.MinValues[i]) -} - -func (f *formatColumnIndex) MaxValue(i int) Value { - if f.NullPage(i) { - return Value{} - } - return f.kind.Value(f.index.MaxValues[i]) -} - -func (f *formatColumnIndex) IsAscending() bool { - return f.index.BoundaryOrder == format.Ascending -} - -func (f *formatColumnIndex) IsDescending() bool { - return f.index.BoundaryOrder == format.Descending -} - -type fileColumnIndex struct{ chunk *fileColumnChunk } - -func (i fileColumnIndex) NumPages() int { - return len(i.columnIndex().NullPages) -} - -func (i fileColumnIndex) NullCount(j int) int64 { - index := i.columnIndex() - if len(index.NullCounts) > 0 { - return index.NullCounts[j] - } - return 0 -} - -func (i fileColumnIndex) NullPage(j int) bool { - return isNullPage(j, i.columnIndex()) -} - -func (i fileColumnIndex) MinValue(j int) Value { - index := i.columnIndex() - if isNullPage(j, index) { - return Value{} - } - return i.makeValue(index.MinValues[j]) -} - -func (i fileColumnIndex) MaxValue(j int) Value { - index := i.columnIndex() - if isNullPage(j, index) { - return Value{} - } - return i.makeValue(index.MaxValues[j]) -} - -func (i fileColumnIndex) IsAscending() bool { - return i.columnIndex().BoundaryOrder == format.Ascending -} - -func (i fileColumnIndex) IsDescending() bool { - return i.columnIndex().BoundaryOrder == format.Descending -} - -func (i *fileColumnIndex) makeValue(b []byte) Value { - return i.chunk.column.typ.Kind().Value(b) -} - -func (i fileColumnIndex) columnIndex() *format.ColumnIndex { return i.chunk.columnIndex.Load() } - -func isNullPage(j int, index *format.ColumnIndex) bool { - return len(index.NullPages) > 0 && index.NullPages[j] -} - -type emptyColumnIndex struct{} - -func (emptyColumnIndex) NumPages() int { return 0 } -func (emptyColumnIndex) NullCount(int) int64 { return 0 } -func (emptyColumnIndex) NullPage(int) bool { return false } -func (emptyColumnIndex) MinValue(int) Value { return Value{} } -func (emptyColumnIndex) MaxValue(int) Value { return Value{} } -func (emptyColumnIndex) IsAscending() bool { return false } -func (emptyColumnIndex) IsDescending() bool { return false } - -type booleanColumnIndex struct{ page *booleanPage } - -func (i booleanColumnIndex) NumPages() int { return 1 } -func (i booleanColumnIndex) NullCount(int) int64 { 
return 0 } -func (i booleanColumnIndex) NullPage(int) bool { return false } -func (i booleanColumnIndex) MinValue(int) Value { return makeValueBoolean(i.page.min()) } -func (i booleanColumnIndex) MaxValue(int) Value { return makeValueBoolean(i.page.max()) } -func (i booleanColumnIndex) IsAscending() bool { return false } -func (i booleanColumnIndex) IsDescending() bool { return false } - -type int32ColumnIndex struct{ page *int32Page } - -func (i int32ColumnIndex) NumPages() int { return 1 } -func (i int32ColumnIndex) NullCount(int) int64 { return 0 } -func (i int32ColumnIndex) NullPage(int) bool { return false } -func (i int32ColumnIndex) MinValue(int) Value { return makeValueInt32(i.page.min()) } -func (i int32ColumnIndex) MaxValue(int) Value { return makeValueInt32(i.page.max()) } -func (i int32ColumnIndex) IsAscending() bool { return false } -func (i int32ColumnIndex) IsDescending() bool { return false } - -type int64ColumnIndex struct{ page *int64Page } - -func (i int64ColumnIndex) NumPages() int { return 1 } -func (i int64ColumnIndex) NullCount(int) int64 { return 0 } -func (i int64ColumnIndex) NullPage(int) bool { return false } -func (i int64ColumnIndex) MinValue(int) Value { return makeValueInt64(i.page.min()) } -func (i int64ColumnIndex) MaxValue(int) Value { return makeValueInt64(i.page.max()) } -func (i int64ColumnIndex) IsAscending() bool { return false } -func (i int64ColumnIndex) IsDescending() bool { return false } - -type int96ColumnIndex struct{ page *int96Page } - -func (i int96ColumnIndex) NumPages() int { return 1 } -func (i int96ColumnIndex) NullCount(int) int64 { return 0 } -func (i int96ColumnIndex) NullPage(int) bool { return false } -func (i int96ColumnIndex) MinValue(int) Value { return makeValueInt96(i.page.min()) } -func (i int96ColumnIndex) MaxValue(int) Value { return makeValueInt96(i.page.max()) } -func (i int96ColumnIndex) IsAscending() bool { return false } -func (i int96ColumnIndex) IsDescending() bool { return false } - -type floatColumnIndex struct{ page *floatPage } - -func (i floatColumnIndex) NumPages() int { return 1 } -func (i floatColumnIndex) NullCount(int) int64 { return 0 } -func (i floatColumnIndex) NullPage(int) bool { return false } -func (i floatColumnIndex) MinValue(int) Value { return makeValueFloat(i.page.min()) } -func (i floatColumnIndex) MaxValue(int) Value { return makeValueFloat(i.page.max()) } -func (i floatColumnIndex) IsAscending() bool { return false } -func (i floatColumnIndex) IsDescending() bool { return false } - -type doubleColumnIndex struct{ page *doublePage } - -func (i doubleColumnIndex) NumPages() int { return 1 } -func (i doubleColumnIndex) NullCount(int) int64 { return 0 } -func (i doubleColumnIndex) NullPage(int) bool { return false } -func (i doubleColumnIndex) MinValue(int) Value { return makeValueDouble(i.page.min()) } -func (i doubleColumnIndex) MaxValue(int) Value { return makeValueDouble(i.page.max()) } -func (i doubleColumnIndex) IsAscending() bool { return false } -func (i doubleColumnIndex) IsDescending() bool { return false } - -type byteArrayColumnIndex struct{ page *byteArrayPage } - -func (i byteArrayColumnIndex) NumPages() int { return 1 } -func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 } -func (i byteArrayColumnIndex) NullPage(int) bool { return false } -func (i byteArrayColumnIndex) MinValue(int) Value { return makeValueBytes(ByteArray, i.page.min()) } -func (i byteArrayColumnIndex) MaxValue(int) Value { return makeValueBytes(ByteArray, i.page.max()) } -func (i byteArrayColumnIndex) 
IsAscending() bool { return false } -func (i byteArrayColumnIndex) IsDescending() bool { return false } - -type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage } - -func (i fixedLenByteArrayColumnIndex) NumPages() int { return 1 } -func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 } -func (i fixedLenByteArrayColumnIndex) NullPage(int) bool { return false } -func (i fixedLenByteArrayColumnIndex) MinValue(int) Value { - return makeValueBytes(FixedLenByteArray, i.page.min()) -} -func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value { - return makeValueBytes(FixedLenByteArray, i.page.max()) -} -func (i fixedLenByteArrayColumnIndex) IsAscending() bool { return false } -func (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false } - -type uint32ColumnIndex struct{ page *uint32Page } - -func (i uint32ColumnIndex) NumPages() int { return 1 } -func (i uint32ColumnIndex) NullCount(int) int64 { return 0 } -func (i uint32ColumnIndex) NullPage(int) bool { return false } -func (i uint32ColumnIndex) MinValue(int) Value { return makeValueUint32(i.page.min()) } -func (i uint32ColumnIndex) MaxValue(int) Value { return makeValueUint32(i.page.max()) } -func (i uint32ColumnIndex) IsAscending() bool { return false } -func (i uint32ColumnIndex) IsDescending() bool { return false } - -type uint64ColumnIndex struct{ page *uint64Page } - -func (i uint64ColumnIndex) NumPages() int { return 1 } -func (i uint64ColumnIndex) NullCount(int) int64 { return 0 } -func (i uint64ColumnIndex) NullPage(int) bool { return false } -func (i uint64ColumnIndex) MinValue(int) Value { return makeValueUint64(i.page.min()) } -func (i uint64ColumnIndex) MaxValue(int) Value { return makeValueUint64(i.page.max()) } -func (i uint64ColumnIndex) IsAscending() bool { return false } -func (i uint64ColumnIndex) IsDescending() bool { return false } - -type be128ColumnIndex struct{ page *be128Page } - -func (i be128ColumnIndex) NumPages() int { return 1 } -func (i be128ColumnIndex) NullCount(int) int64 { return 0 } -func (i be128ColumnIndex) NullPage(int) bool { return false } -func (i be128ColumnIndex) MinValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.min()) } -func (i be128ColumnIndex) MaxValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.max()) } -func (i be128ColumnIndex) IsAscending() bool { return false } -func (i be128ColumnIndex) IsDescending() bool { return false } - -// The ColumnIndexer interface is implemented by types that support generating -// parquet column indexes. -// -// The package does not export any types that implement this interface, programs -// must call NewColumnIndexer on a Type instance to construct column indexers. -type ColumnIndexer interface { - // Resets the column indexer state. - Reset() - - // Add a page to the column indexer. - IndexPage(numValues, numNulls int64, min, max Value) - - // Generates a format.ColumnIndex value from the current state of the - // column indexer. - // - // The returned value may reference internal buffers, in which case the - // values remain valid until the next call to IndexPage or Reset on the - // column indexer. 
- ColumnIndex() format.ColumnIndex -} - -type baseColumnIndexer struct { - nullPages []bool - nullCounts []int64 -} - -func (i *baseColumnIndexer) reset() { - i.nullPages = i.nullPages[:0] - i.nullCounts = i.nullCounts[:0] -} - -func (i *baseColumnIndexer) observe(numValues, numNulls int64) { - i.nullPages = append(i.nullPages, numValues == numNulls) - i.nullCounts = append(i.nullCounts, numNulls) -} - -func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex { - nullPages := make([]bool, len(i.nullPages)) - copy(nullPages, i.nullPages) - nullCounts := make([]int64, len(i.nullCounts)) - copy(nullCounts, i.nullCounts) - return format.ColumnIndex{ - NullPages: nullPages, - NullCounts: nullCounts, - MinValues: minValues, - MaxValues: maxValues, - BoundaryOrder: boundaryOrderOf(minOrder, maxOrder), - } -} - -type booleanColumnIndexer struct { - baseColumnIndexer - minValues []bool - maxValues []bool -} - -func newBooleanColumnIndexer() *booleanColumnIndexer { - return new(booleanColumnIndexer) -} - -func (i *booleanColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.boolean()) - i.maxValues = append(i.maxValues, max.boolean()) -} - -func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 1), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 1), - orderOfBool(i.minValues), - orderOfBool(i.maxValues), - ) -} - -type int32ColumnIndexer struct { - baseColumnIndexer - minValues []int32 - maxValues []int32 -} - -func newInt32ColumnIndexer() *int32ColumnIndexer { - return new(int32ColumnIndexer) -} - -func (i *int32ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.int32()) - i.maxValues = append(i.maxValues, max.int32()) -} - -func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 4), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 4), - orderOfInt32(i.minValues), - orderOfInt32(i.maxValues), - ) -} - -type int64ColumnIndexer struct { - baseColumnIndexer - minValues []int64 - maxValues []int64 -} - -func newInt64ColumnIndexer() *int64ColumnIndexer { - return new(int64ColumnIndexer) -} - -func (i *int64ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.int64()) - i.maxValues = append(i.maxValues, max.int64()) -} - -func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 8), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 8), - orderOfInt64(i.minValues), - orderOfInt64(i.maxValues), - ) -} - -type int96ColumnIndexer struct { - baseColumnIndexer - minValues []deprecated.Int96 - maxValues []deprecated.Int96 -} - -func newInt96ColumnIndexer() *int96ColumnIndexer { - return new(int96ColumnIndexer) 
-} - -func (i *int96ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.Int96()) - i.maxValues = append(i.maxValues, max.Int96()) -} - -func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 12), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 12), - deprecated.OrderOfInt96(i.minValues), - deprecated.OrderOfInt96(i.maxValues), - ) -} - -type floatColumnIndexer struct { - baseColumnIndexer - minValues []float32 - maxValues []float32 -} - -func newFloatColumnIndexer() *floatColumnIndexer { - return new(floatColumnIndexer) -} - -func (i *floatColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.float()) - i.maxValues = append(i.maxValues, max.float()) -} - -func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 4), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 4), - orderOfFloat32(i.minValues), - orderOfFloat32(i.maxValues), - ) -} - -type doubleColumnIndexer struct { - baseColumnIndexer - minValues []float64 - maxValues []float64 -} - -func newDoubleColumnIndexer() *doubleColumnIndexer { - return new(doubleColumnIndexer) -} - -func (i *doubleColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.double()) - i.maxValues = append(i.maxValues, max.double()) -} - -func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 8), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 8), - orderOfFloat64(i.minValues), - orderOfFloat64(i.maxValues), - ) -} - -type byteArrayColumnIndexer struct { - baseColumnIndexer - sizeLimit int - minValues []byte - maxValues []byte -} - -func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { - return &byteArrayColumnIndexer{sizeLimit: sizeLimit} -} - -func (i *byteArrayColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = plain.AppendByteArray(i.minValues, min.byteArray()) - i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray()) -} - -func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := splitByteArrays(i.minValues) - maxValues := splitByteArrays(i.maxValues) - if sizeLimit := i.sizeLimit; sizeLimit > 0 { - for i, v := range minValues { - minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) - } - for i, v := range maxValues { - maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) - } - } - return i.columnIndex( - minValues, - maxValues, - orderOfBytes(minValues), - orderOfBytes(maxValues), - ) -} - -type fixedLenByteArrayColumnIndexer struct { - baseColumnIndexer - 
size int - sizeLimit int - minValues []byte - maxValues []byte -} - -func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer { - return &fixedLenByteArrayColumnIndexer{ - size: size, - sizeLimit: sizeLimit, - } -} - -func (i *fixedLenByteArrayColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.byteArray()...) - i.maxValues = append(i.maxValues, max.byteArray()...) -} - -func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := splitFixedLenByteArrays(i.minValues, i.size) - maxValues := splitFixedLenByteArrays(i.maxValues, i.size) - if sizeLimit := i.sizeLimit; sizeLimit > 0 { - for i, v := range minValues { - minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) - } - for i, v := range maxValues { - maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) - } - } - return i.columnIndex( - minValues, - maxValues, - orderOfBytes(minValues), - orderOfBytes(maxValues), - ) -} - -type uint32ColumnIndexer struct { - baseColumnIndexer - minValues []uint32 - maxValues []uint32 -} - -func newUint32ColumnIndexer() *uint32ColumnIndexer { - return new(uint32ColumnIndexer) -} - -func (i *uint32ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.uint32()) - i.maxValues = append(i.maxValues, max.uint32()) -} - -func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 4), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 4), - orderOfUint32(i.minValues), - orderOfUint32(i.maxValues), - ) -} - -type uint64ColumnIndexer struct { - baseColumnIndexer - minValues []uint64 - maxValues []uint64 -} - -func newUint64ColumnIndexer() *uint64ColumnIndexer { - return new(uint64ColumnIndexer) -} - -func (i *uint64ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - i.minValues = append(i.minValues, min.uint64()) - i.maxValues = append(i.maxValues, max.uint64()) -} - -func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex { - return i.columnIndex( - splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 8), - splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 8), - orderOfUint64(i.minValues), - orderOfUint64(i.maxValues), - ) -} - -type be128ColumnIndexer struct { - baseColumnIndexer - minValues [][16]byte - maxValues [][16]byte -} - -func newBE128ColumnIndexer() *be128ColumnIndexer { - return new(be128ColumnIndexer) -} - -func (i *be128ColumnIndexer) Reset() { - i.reset() - i.minValues = i.minValues[:0] - i.maxValues = i.maxValues[:0] -} - -func (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { - i.observe(numValues, numNulls) - if !min.IsNull() { - i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray())) - } - if !max.IsNull() { - i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray())) - } -} - -func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex { - 
minValues := splitFixedLenByteArrays(unsafecast.Slice[byte](i.minValues), 16) - maxValues := splitFixedLenByteArrays(unsafecast.Slice[byte](i.maxValues), 16) - return i.columnIndex( - minValues, - maxValues, - orderOfBytes(minValues), - orderOfBytes(maxValues), - ) +func columnIndexInt32Values(values []int32) []byte { + return unsafecast.Slice[byte](values) } -func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte { - if len(value) > sizeLimit { - value = value[:sizeLimit] - } - return value +func columnIndexInt64Values(values []int64) []byte { + return unsafecast.Slice[byte](values) } -// truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit. -// If the given byte array is truncated, it is incremented by 1 in place. -func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte { - if len(value) > sizeLimit { - value = value[:sizeLimit] - incrementByteArrayInplace(value) - } - return value +func columnIndexInt96Values(values []deprecated.Int96) []byte { + return unsafecast.Slice[byte](values) } -// incrementByteArray increments the given byte array by 1. -// Reference: https://github.com/apache/parquet-java/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124 -func incrementByteArrayInplace(value []byte) { - for i := len(value) - 1; i >= 0; i-- { - value[i]++ - if value[i] != 0 { // Did not overflow: 0xFF -> 0x00 - return - } - } - // Fully overflowed, so restore all to 0xFF - for i := range value { - value[i] = 0xFF - } +func columnIndexFloatValues(values []float32) []byte { + return unsafecast.Slice[byte](values) } -func splitByteArrays(data []byte) [][]byte { - length := 0 - plain.RangeByteArray(data, func([]byte) error { - length++ - return nil - }) - buffer := make([]byte, 0, len(data)-(4*length)) - values := make([][]byte, 0, length) - plain.RangeByteArray(data, func(value []byte) error { - offset := len(buffer) - buffer = append(buffer, value...) 
- values = append(values, buffer[offset:]) - return nil - }) - return values +func columnIndexDoubleValues(values []float64) []byte { + return unsafecast.Slice[byte](values) } -func splitFixedLenByteArrays(data []byte, size int) [][]byte { - data = copyBytes(data) - values := make([][]byte, len(data)/size) - for i := range values { - j := (i + 0) * size - k := (i + 1) * size - values[i] = data[j:k:k] - } - return values +func columnIndexUint32Values(values []uint32) []byte { + return unsafecast.Slice[byte](values) } -func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder { - if minOrder == maxOrder { - switch { - case minOrder > 0: - return format.Ascending - case minOrder < 0: - return format.Descending - } - } - return format.Unordered +func columnIndexUint64Values(values []uint64) []byte { + return unsafecast.Slice[byte](values) } diff --git a/vendor/github.com/parquet-go/parquet-go/config.go b/vendor/github.com/parquet-go/parquet-go/config.go index 63922d8e3e831..45ff8bab2bb04 100644 --- a/vendor/github.com/parquet-go/parquet-go/config.go +++ b/vendor/github.com/parquet-go/parquet-go/config.go @@ -25,6 +25,7 @@ const ( DefaultWriteBufferSize = 32 * 1024 DefaultDataPageVersion = 2 DefaultDataPageStatistics = false + DefaultSkipMagicBytes = false DefaultSkipPageIndex = false DefaultSkipBloomFilters = false DefaultMaxRowsPerRowGroup = math.MaxInt64 @@ -90,8 +91,10 @@ func formatCreatedBy(application, version, build string) string { // ReadMode: ReadModeAsync, // }) type FileConfig struct { + SkipMagicBytes bool SkipPageIndex bool SkipBloomFilters bool + OptimisticRead bool ReadBufferSize int ReadMode ReadMode Schema *Schema @@ -101,6 +104,7 @@ type FileConfig struct { // default file configuration. func DefaultFileConfig() *FileConfig { return &FileConfig{ + SkipMagicBytes: DefaultSkipMagicBytes, SkipPageIndex: DefaultSkipPageIndex, SkipBloomFilters: DefaultSkipBloomFilters, ReadBufferSize: defaultReadBufferSize, @@ -130,6 +134,7 @@ func (c *FileConfig) Apply(options ...FileOption) { // ConfigureFile applies configuration options from c to config. func (c *FileConfig) ConfigureFile(config *FileConfig) { *config = FileConfig{ + SkipMagicBytes: c.SkipMagicBytes, SkipPageIndex: c.SkipPageIndex, SkipBloomFilters: c.SkipBloomFilters, ReadBufferSize: coalesceInt(c.ReadBufferSize, config.ReadBufferSize), @@ -435,6 +440,14 @@ type SortingOption interface { ConfigureSorting(*SortingConfig) } +// SkipMagicBytes is a file configuration option which prevents automatically +// reading the magic bytes when opening a parquet file, when set to true. This +// is useful as an optimization when programs can trust that they are dealing +// with parquet files and do not need to verify the first 4 bytes. +func SkipMagicBytes(skip bool) FileOption { + return fileOption(func(config *FileConfig) { config.SkipMagicBytes = skip }) +} + // SkipPageIndex is a file configuration option which prevents automatically // reading the page index when opening a parquet file, when set to true. This is // useful as an optimization when programs know that they will not need to @@ -455,6 +468,17 @@ func SkipBloomFilters(skip bool) FileOption { return fileOption(func(config *FileConfig) { config.SkipBloomFilters = skip }) } +// OptimisticRead configures a file to optimistically perform larger buffered +// reads to improve performance. This is useful when reading from remote storage +// and amortizes the cost of network round trips. +// +// This is an option instead of being enabled by default because dependents of this +// package have historically relied on the read patterns to provide external +// caches and achieve similar results (e.g., Tempo). +func OptimisticRead(enabled bool) FileOption { + return fileOption(func(config *FileConfig) { config.OptimisticRead = enabled }) +} +
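A minimal sketch of opening a file with the two options introduced above, assuming r is an io.ReaderAt over a trusted parquet file of length size; per the OpenFile changes in this diff, the optimistic footer read is sized by the configured read buffer size:

	f, err := parquet.OpenFile(r, size,
		parquet.SkipMagicBytes(true), // the caller already trusts that r is a parquet file
		parquet.OptimisticRead(true), // buffer the footer read to spare extra round trips
	)
	if err != nil {
		return err
	}
	_ = f.NumRows()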
// FileReadMode is a file configuration option which controls the way pages // are read. Currently the only two options are ReadModeAsync and ReadModeSync // which control whether or not pages are loaded asynchronously. It can be diff --git a/vendor/github.com/parquet-go/parquet-go/convert.go b/vendor/github.com/parquet-go/parquet-go/convert.go index 96a49a81f6376..df1f7f43cb329 100644 --- a/vendor/github.com/parquet-go/parquet-go/convert.go +++ b/vendor/github.com/parquet-go/parquet-go/convert.go @@ -86,7 +86,7 @@ func convertToSelf(column []Value) error { return nil } func convertToType(targetType, sourceType Type) conversionFunc { return func(column []Value) error { for i, v := range column { - v, err := sourceType.ConvertValue(v, targetType) + v, err := targetType.ConvertValue(v, sourceType) if err != nil { return err } @@ -249,7 +249,7 @@ func Convert(to, from Node) (conv Conversion, err error) { schema = NewSchema("", to) } - if nodesAreEqual(to, from) { + if EqualNodes(to, from) { return identity{schema}, nil } diff --git a/vendor/github.com/parquet-go/parquet-go/dedupe.go b/vendor/github.com/parquet-go/parquet-go/dedupe.go index 9355488d3c13d..0f434396753c3 100644 --- a/vendor/github.com/parquet-go/parquet-go/dedupe.go +++ b/vendor/github.com/parquet-go/parquet-go/dedupe.go @@ -66,14 +66,12 @@ func (d *dedupeRowWriter) WriteRows(rows []Row) (int, error) { } type dedupe struct { - alloc rowAllocator lastRow Row uniq []Row dupe []Row } func (d *dedupe) reset() { - d.alloc.reset() d.lastRow = d.lastRow[:0] } @@ -104,8 +102,6 @@ func (d *dedupe) deduplicate(rows []Row, compare func(Row, Row) int) int { rows = append(rows, d.uniq...) rows = append(rows, d.dupe...) - d.alloc.reset() - d.alloc.capture(lastRow) d.lastRow = append(d.lastRow[:0], lastRow...) return len(d.uniq) } diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/thrift/encode.go b/vendor/github.com/parquet-go/parquet-go/encoding/thrift/encode.go index bd2c3a98039af..9d6cfd2e5510b 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/thrift/encode.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/thrift/encode.go @@ -2,10 +2,11 @@ package thrift import ( "bytes" + "cmp" "fmt" "math" "reflect" - "sort" + "slices" "sync/atomic" ) @@ -360,8 +361,8 @@ func encodeFuncStructOf(t reflect.Type, seen encodeFuncCache) encodeFunc { } }) - sort.SliceStable(enc.fields, func(i, j int) bool { - return enc.fields[i].id < enc.fields[j].id + slices.SortStableFunc(enc.fields, func(a, b structEncoderField) int { + return cmp.Compare(a.id, b.id) }) for i := len(enc.fields) - 1; i > 0; i-- { diff --git a/vendor/github.com/parquet-go/parquet-go/errors.go b/vendor/github.com/parquet-go/parquet-go/errors.go index bcef1faf52786..651fe740c88ee 100644 --- a/vendor/github.com/parquet-go/parquet-go/errors.go +++ b/vendor/github.com/parquet-go/parquet-go/errors.go @@ -59,6 +59,10 @@ var ( // cannot be done because there are no rules to translate between their // physical types.
ErrInvalidConversion = errors.New("invalid conversion between parquet values") + + // ErrMalformedRepetitionLevel is returned when a page reader encounters + // a repetition level which does not start at the beginning of a row. + ErrMalformedRepetitionLevel = errors.New("parquet-go encountered a malformed data page which does not start at the beginning of a row") ) type errno int diff --git a/vendor/github.com/parquet-go/parquet-go/file.go b/vendor/github.com/parquet-go/parquet-go/file.go index 384042b308ca9..48902091b0d18 100644 --- a/vendor/github.com/parquet-go/parquet-go/file.go +++ b/vendor/github.com/parquet-go/parquet-go/file.go @@ -6,6 +6,7 @@ import ( "fmt" "hash/crc32" "io" + "slices" "sort" "strings" "sync" @@ -35,6 +36,18 @@ type File struct { config *FileConfig } +type FileView interface { + Metadata() *format.FileMetaData + Schema() *Schema + NumRows() int64 + Lookup(key string) (string, bool) + Size() int64 + Root() *Column + RowGroups() []RowGroup + ColumnIndexes() []format.ColumnIndex + OffsetIndexes() []format.OffsetIndex +} + // OpenFile opens a parquet file and reads the content between offset 0 and the given // size in r. // @@ -42,39 +55,64 @@ type File struct { // parts of the file are left untouched; this means that successfully opening // a file does not validate that the pages have valid checksums. func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { - b := make([]byte, 8) c, err := NewFileConfig(options...) if err != nil { return nil, err } f := &File{reader: r, size: size, config: c} - if _, err := readAt(r, b[:4], 0); err != nil { - return nil, fmt.Errorf("reading magic header of parquet file: %w", err) - } - if string(b[:4]) != "PAR1" { - return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4]) + if !c.SkipMagicBytes { + var b [4]byte + if _, err := readAt(r, b[:4], 0); err != nil { + return nil, fmt.Errorf("reading magic header of parquet file: %w", err) + } + if string(b[:4]) != "PAR1" { + return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4]) + } } if cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok { cast.SetMagicFooterSection(size-8, 8) } - if n, err := r.ReadAt(b[:8], size-8); n != 8 { - return nil, fmt.Errorf("reading magic footer of parquet file: %w", err) + + optimisticRead := c.OptimisticRead + optimisticFooterSize := min(int64(c.ReadBufferSize), size) + if !optimisticRead || optimisticFooterSize < 8 { + optimisticFooterSize = 8 + } + optimisticFooterData := make([]byte, optimisticFooterSize) + if optimisticRead { + f.reader = &optimisticFileReaderAt{ + reader: f.reader, + offset: size - optimisticFooterSize, + footer: optimisticFooterData, + } + } + + if n, err := readAt(r, optimisticFooterData, size-optimisticFooterSize); n != len(optimisticFooterData) { + return nil, fmt.Errorf("reading magic footer of parquet file: %w (read: %d)", err, n) } - if string(b[4:8]) != "PAR1" { - return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:8]) + optimisticFooterSize -= 8 + b := optimisticFooterData[optimisticFooterSize:] + if string(b[4:]) != "PAR1" { + return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:]) } footerSize := int64(binary.LittleEndian.Uint32(b[:4])) - footerData := make([]byte, footerSize) + footerData := []byte(nil) - if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok { - cast.SetFooterSection(size-(footerSize+8), footerSize) - } - if _, err := f.readAt(footerData, 
size-(footerSize+8)); err != nil { - return nil, fmt.Errorf("reading footer of parquet file: %w", err) + if footerSize <= optimisticFooterSize { + footerData = optimisticFooterData[optimisticFooterSize-footerSize : optimisticFooterSize] + } else { + footerData = make([]byte, footerSize) + if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok { + cast.SetFooterSection(size-(footerSize+8), footerSize) + } + if _, err := f.readAt(footerData, size-(footerSize+8)); err != nil { + return nil, fmt.Errorf("reading footer of parquet file: %w", err) + } } + if err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil { return nil, fmt.Errorf("reading parquet file metadata: %w", err) } @@ -88,28 +126,18 @@ func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { } } - if f.root, err = openColumns(f); err != nil { + if f.root, err = openColumns(f, &f.metadata, f.columnIndexes, f.offsetIndexes); err != nil { return nil, fmt.Errorf("opening columns of parquet file: %w", err) } - var schema *Schema if c.Schema != nil { - schema = c.Schema + f.schema = c.Schema } else { - schema = NewSchema(f.root.Name(), f.root) - } - columns := make([]*Column, 0, numLeafColumnsOf(f.root)) - f.schema = schema - f.root.forEachLeaf(func(c *Column) { columns = append(columns, c) }) - - rowGroups := make([]fileRowGroup, len(f.metadata.RowGroups)) - for i := range rowGroups { - rowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i]) - } - f.rowGroups = make([]RowGroup, len(rowGroups)) - for i := range rowGroups { - f.rowGroups[i] = &rowGroups[i] + f.schema = NewSchema(f.root.Name(), f.root) } + columns := makeLeafColumns(f.root) + rowGroups := makeFileRowGroups(f, columns) + f.rowGroups = makeRowGroups(rowGroups) if !c.SkipBloomFilters { section := io.NewSectionReader(r, 0, size) @@ -124,7 +152,7 @@ func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { g := &rowGroups[i] for j := range g.columns { - c := g.columns[j].(*fileColumnChunk) + c := g.columns[j].(*FileColumnChunk) if offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 { section.Seek(offset, io.SeekStart) @@ -144,13 +172,14 @@ func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { cast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength) } - c.bloomFilter = newBloomFilter(r, offset, &header) + c.bloomFilter.Store(newBloomFilter(r, offset, &header)) } } } } sortKeyValueMetadata(f.metadata.KeyValueMetadata) + f.reader = r // restore in case an optimistic reader was used return f, nil } @@ -283,6 +312,8 @@ func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, erro func (f *File) NumRows() int64 { return f.metadata.NumRows } // RowGroups returns the list of row groups in the file. +// +// Elements of the returned slice are guaranteed to be of type *FileRowGroup. func (f *File) RowGroups() []RowGroup { return f.rowGroups } // Root returns the root column of f. @@ -318,14 +349,13 @@ func (f *File) ReadAt(b []byte, off int64) (int, error) { // ColumnIndexes returns the page index of the parquet file f. // -// If the file did not contain a column index, the method returns an empty slice -// and nil error. +// If the file did not contain a column index, the method returns an empty slice. func (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes } // OffsetIndexes returns the page index of the parquet file f. 
// // If the file did not contain an offset index, the method returns an empty -// slice and nil error. +// slice. func (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes } // Lookup returns the value associated with the given key in the file key/value @@ -343,46 +373,44 @@ func (f *File) hasIndexes() bool { var _ io.ReaderAt = (*File)(nil) func sortKeyValueMetadata(keyValueMetadata []format.KeyValue) { - sort.Slice(keyValueMetadata, func(i, j int) bool { - switch { - case keyValueMetadata[i].Key < keyValueMetadata[j].Key: - return true - case keyValueMetadata[i].Key > keyValueMetadata[j].Key: - return false - default: - return keyValueMetadata[i].Value < keyValueMetadata[j].Value + slices.SortFunc(keyValueMetadata, func(a, b format.KeyValue) int { + if cmp := strings.Compare(a.Key, b.Key); cmp != 0 { + return cmp } + return strings.Compare(a.Value, b.Value) }) } func lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) { - i := sort.Search(len(keyValueMetadata), func(i int) bool { - return keyValueMetadata[i].Key >= key + i, found := slices.BinarySearchFunc(keyValueMetadata, key, func(kv format.KeyValue, key string) int { + return strings.Compare(kv.Key, key) }) - if i == len(keyValueMetadata) || keyValueMetadata[i].Key != key { - return "", false + if found { + return keyValueMetadata[i].Value, true } - return keyValueMetadata[i].Value, true + return "", false } -type fileRowGroup struct { - schema *Schema +// FileRowGroup is an implementation of the RowGroup interface on parquet files +// returned by OpenFile. +type FileRowGroup struct { + file *File rowGroup *format.RowGroup columns []ColumnChunk sorting []SortingColumn - config *FileConfig } -func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) { - g.schema = schema +func (g *FileRowGroup) init(file *File, columns []*Column, rowGroup *format.RowGroup) { + g.file = file g.rowGroup = rowGroup - g.config = file.config g.columns = make([]ColumnChunk, len(rowGroup.Columns)) g.sorting = make([]SortingColumn, len(rowGroup.SortingColumns)) - fileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns)) + fileColumnChunks := make([]FileColumnChunk, len(rowGroup.Columns)) + fileColumnIndexes := make([]FileColumnIndex, len(rowGroup.Columns)) + fileOffsetIndexes := make([]FileOffsetIndex, len(rowGroup.Columns)) for i := range g.columns { - fileColumnChunks[i] = fileColumnChunk{ + fileColumnChunks[i] = FileColumnChunk{ file: file, column: columns[i], rowGroup: rowGroup, @@ -391,8 +419,12 @@ func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGr if file.hasIndexes() { j := (int(rowGroup.Ordinal) * len(columns)) + i - fileColumnChunks[i].columnIndex.Store(&file.columnIndexes[j]) - fileColumnChunks[i].offsetIndex.Store(&file.offsetIndexes[j]) + + fileColumnIndexes[i] = FileColumnIndex{index: &file.columnIndexes[j], kind: columns[i].Type().Kind()} + fileOffsetIndexes[i] = FileOffsetIndex{index: &file.offsetIndexes[j]} + + fileColumnChunks[i].columnIndex.Store(&fileColumnIndexes[i]) + fileColumnChunks[i].offsetIndex.Store(&fileOffsetIndexes[i]) } g.columns[i] = &fileColumnChunks[i] @@ -407,11 +439,31 @@ func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGr } } -func (g *fileRowGroup) Schema() *Schema { return g.schema } -func (g *fileRowGroup) NumRows() int64 { return g.rowGroup.NumRows } -func (g *fileRowGroup) ColumnChunks() []ColumnChunk { return g.columns } -func (g *fileRowGroup) 
SortingColumns() []SortingColumn { return g.sorting } -func (g *fileRowGroup) Rows() Rows { return newRowGroupRows(g, g.config.ReadMode) } +// File returns the file that this row group belongs to. +func (g *FileRowGroup) File() *File { return g.file } + +// Schema returns the schema of the row group. +func (g *FileRowGroup) Schema() *Schema { return g.file.schema } + +// NumRows returns the number of rows in the row group. +func (g *FileRowGroup) NumRows() int64 { return g.rowGroup.NumRows } + +// ColumnChunks returns the list of column chunks in the row group. +// +// Elements of the returned slice are guaranteed to be of type *FileColumnChunk. +func (g *FileRowGroup) ColumnChunks() []ColumnChunk { return g.columns } + +// SortingColumns returns the list of sorting columns in the row group. +func (g *FileRowGroup) SortingColumns() []SortingColumn { return g.sorting } + +// Rows returns a row reader for the row group. +func (g *FileRowGroup) Rows() Rows { + rowGroup := RowGroup(g) + if g.file.config.ReadMode == ReadModeAsync { + rowGroup = AsyncRowGroup(rowGroup) + } + return NewRowGroupRowReader(rowGroup) +} type fileSortingColumn struct { column *Column @@ -437,82 +489,174 @@ func (s *fileSortingColumn) String() string { return b.String() } -type fileColumnChunk struct { +// FileColumnChunk is an implementation of the ColumnChunk interface on parquet +// files returned by OpenFile. +type FileColumnChunk struct { file *File column *Column - bloomFilter *bloomFilter rowGroup *format.RowGroup - columnIndex atomic.Pointer[format.ColumnIndex] - offsetIndex atomic.Pointer[format.OffsetIndex] chunk *format.ColumnChunk + columnIndex atomic.Pointer[FileColumnIndex] + offsetIndex atomic.Pointer[FileOffsetIndex] + bloomFilter atomic.Pointer[FileBloomFilter] } -func (c *fileColumnChunk) Type() Type { - return c.column.Type() +// File returns the file that this column chunk belongs to. +func (c *FileColumnChunk) File() *File { return c.file } + +// Node returns the node that this column chunk belongs to in the parquet schema. +func (c *FileColumnChunk) Node() Node { return c.column } + +// Type returns the type of the column chunk. +func (c *FileColumnChunk) Type() Type { return c.column.Type() } + +// Column returns the column index of this chunk in its parent row group. +func (c *FileColumnChunk) Column() int { return int(c.column.Index()) } + +// Bounds returns the min and max values found in the column chunk. +func (c *FileColumnChunk) Bounds() (min, max Value, ok bool) { + stats := &c.chunk.MetaData.Statistics + columnKind := c.Type().Kind() + hasMinValue := stats.MinValue != nil + hasMaxValue := stats.MaxValue != nil + if hasMinValue { + min = columnKind.Value(stats.MinValue) + } + if hasMaxValue { + max = columnKind.Value(stats.MaxValue) + } + return min, max, hasMinValue && hasMaxValue } -func (c *fileColumnChunk) Column() int { - return int(c.column.Index()) +// Pages returns a page reader for the column chunk. +func (c *FileColumnChunk) Pages() Pages { + pages := Pages(c.PagesFrom(c.file.reader)) + if c.file.config.ReadMode == ReadModeAsync { + pages = AsyncPages(pages) + } + return pages } -func (c *fileColumnChunk) Pages() Pages { - r := new(filePages) - r.init(c) - return r +// PagesFrom returns a page reader for the column chunk, using the reader passed +// as argument instead of the one that the file was originally opened from. +// +// Note that unlike when calling Pages, the returned reader is not wrapped in an +// AsyncPages reader if the file was opened in async mode. 
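
The newly exported FileRowGroup and FileColumnChunk types make the concrete file-backed implementations reachable from application code. A minimal sketch of what that enables, assuming a hypothetical local file named data.parquet; the type assertion leans on the guarantee documented on FileRowGroup.ColumnChunks:

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/parquet-go/parquet-go"
)

func main() {
	osFile, err := os.Open("data.parquet") // hypothetical input file
	if err != nil {
		log.Fatal(err)
	}
	defer osFile.Close()

	stat, err := osFile.Stat()
	if err != nil {
		log.Fatal(err)
	}

	f, err := parquet.OpenFile(osFile, stat.Size())
	if err != nil {
		log.Fatal(err)
	}

	for _, rg := range f.RowGroups() {
		// Column chunks of a file row group are documented to be
		// *FileColumnChunk values, so this assertion is safe.
		for _, cc := range rg.ColumnChunks() {
			fcc := cc.(*parquet.FileColumnChunk)
			if min, max, ok := fcc.Bounds(); ok {
				fmt.Printf("column %d: min=%v max=%v\n", fcc.Column(), min, max)
			}
		}
	}
}
```
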
+func (c *FileColumnChunk) PagesFrom(reader io.ReaderAt) *FilePages { + pages := new(FilePages) + pages.init(c, reader) + return pages } -func (c *fileColumnChunk) ColumnIndex() (ColumnIndex, error) { - index, err := c.readColumnIndex() +// ColumnIndex returns the column index of the column chunk, or an error if it +// didn't exist or couldn't be read. +func (c *FileColumnChunk) ColumnIndex() (ColumnIndex, error) { + index, err := c.ColumnIndexFrom(c.file.reader) + if err != nil { + return nil, err + } + return index, nil +} + +// ColumnIndexFrom is like ColumnIndex but uses the reader passed as argument to +// read the column index. +func (c *FileColumnChunk) ColumnIndexFrom(reader io.ReaderAt) (*FileColumnIndex, error) { + index, err := c.readColumnIndexFrom(reader) if err != nil { return nil, err } if index == nil || c.chunk.ColumnIndexOffset == 0 { return nil, ErrMissingColumnIndex } - return fileColumnIndex{c}, nil + return index, nil +} + +// OffsetIndex returns the offset index of the column chunk, or an error if it +// didn't exist or couldn't be read. +func (c *FileColumnChunk) OffsetIndex() (OffsetIndex, error) { + index, err := c.OffsetIndexFrom(c.file.reader) + if err != nil { + return nil, err + } + return index, nil } -func (c *fileColumnChunk) OffsetIndex() (OffsetIndex, error) { - index, err := c.readOffsetIndex() +// OffsetIndexFrom is like OffsetIndex but uses the reader passed as argument to +// read the offset index. +func (c *FileColumnChunk) OffsetIndexFrom(reader io.ReaderAt) (*FileOffsetIndex, error) { + index, err := c.readOffsetIndex(reader) if err != nil { return nil, err } if index == nil || c.chunk.OffsetIndexOffset == 0 { return nil, ErrMissingOffsetIndex } - return (*fileOffsetIndex)(index), nil + return index, nil } -func (c *fileColumnChunk) BloomFilter() BloomFilter { - if c.bloomFilter == nil { +// BloomFilter returns the bloom filter of the column chunk, or nil if it didn't +// have one. +func (c *FileColumnChunk) BloomFilter() BloomFilter { + filter, err := c.BloomFilterFrom(c.file.reader) + switch err { + case nil: + return filter + case ErrMissingBloomFilter: return nil + default: + return &errorBloomFilter{err: err} + } +} + +// BloomFilterFrom is like BloomFilter but uses the reader passed as argument to +// read the bloom filter. +func (c *FileColumnChunk) BloomFilterFrom(reader io.ReaderAt) (*FileBloomFilter, error) { + filter, err := c.readBloomFilter(reader) + if err != nil { + return nil, err + } + if filter == nil || c.chunk.MetaData.BloomFilterOffset == 0 { + return nil, ErrMissingBloomFilter } - return c.bloomFilter + return filter, nil } -func (c *fileColumnChunk) NumValues() int64 { +// NumValues returns the number of values in the column chunk. +func (c *FileColumnChunk) NumValues() int64 { return c.chunk.MetaData.NumValues } -func (c *fileColumnChunk) readColumnIndex() (*format.ColumnIndex, error) { +// NullCount returns the number of null values in the column chunk. +// +// This value is extracted from the column chunk statistics; parquet writers are +// not required to populate it.
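
With the errorBloomFilter wrapper above, I/O failures while loading a filter now surface through Check instead of masquerading as a missing filter. A sketch of the pruning pattern this enables, where mightContain is a hypothetical helper:

```go
package example

import "github.com/parquet-go/parquet-go"

// mightContain probes a column chunk's bloom filter before paying the cost
// of decoding its pages. A nil filter means none was written, in which case
// the chunk must be scanned anyway.
func mightContain(chunk parquet.ColumnChunk, v any) (bool, error) {
	filter := chunk.BloomFilter()
	if filter == nil {
		return true, nil
	}
	// Read errors propagate from Check via the errorBloomFilter wrapper.
	return filter.Check(parquet.ValueOf(v))
}
```
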
+func (c *FileColumnChunk) NullCount() int64 { + return c.chunk.MetaData.Statistics.NullCount +} + +func (c *FileColumnChunk) readColumnIndex() (*FileColumnIndex, error) { + return c.readColumnIndexFrom(c.file.reader) +} + +func (c *FileColumnChunk) readColumnIndexFrom(reader io.ReaderAt) (*FileColumnIndex, error) { if index := c.columnIndex.Load(); index != nil { return index, nil } - chunkMeta := c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()] - offset, length := chunkMeta.ColumnIndexOffset, chunkMeta.ColumnIndexLength + columnChunk := &c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()] + offset, length := columnChunk.ColumnIndexOffset, columnChunk.ColumnIndexLength if offset == 0 { return nil, nil } indexData := make([]byte, int(length)) var columnIndex format.ColumnIndex - if _, err := readAt(c.file.reader, indexData, offset); err != nil { + if _, err := readAt(reader, indexData, offset); err != nil { return nil, fmt.Errorf("read %d bytes column index at offset %d: %w", length, offset, err) } if err := thrift.Unmarshal(&c.file.protocol, indexData, &columnIndex); err != nil { return nil, fmt.Errorf("decode column index: rowGroup=%d columnChunk=%d/%d: %w", c.rowGroup.Ordinal, c.Column(), len(c.rowGroup.Columns), err) } - index := &columnIndex + index := &FileColumnIndex{index: &columnIndex, kind: c.column.Type().Kind()} // We do a CAS (and Load on CAS failure) instead of a simple Store for // the nice property that concurrent calling goroutines will only ever // observe a single pointer value for the result. @@ -523,25 +667,25 @@ func (c *fileColumnChunk) readColumnIndex() (*format.ColumnIndex, error) { return index, nil } -func (c *fileColumnChunk) readOffsetIndex() (*format.OffsetIndex, error) { +func (c *FileColumnChunk) readOffsetIndex(reader io.ReaderAt) (*FileOffsetIndex, error) { if index := c.offsetIndex.Load(); index != nil { return index, nil } - chunkMeta := c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()] - offset, length := chunkMeta.OffsetIndexOffset, chunkMeta.OffsetIndexLength + columnChunk := &c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()] + offset, length := columnChunk.OffsetIndexOffset, columnChunk.OffsetIndexLength if offset == 0 { return nil, nil } indexData := make([]byte, int(length)) var offsetIndex format.OffsetIndex - if _, err := readAt(c.file.reader, indexData, offset); err != nil { + if _, err := readAt(reader, indexData, offset); err != nil { return nil, fmt.Errorf("read %d bytes offset index at offset %d: %w", length, offset, err) } if err := thrift.Unmarshal(&c.file.protocol, indexData, &offsetIndex); err != nil { return nil, fmt.Errorf("decode offset index: rowGroup=%d columnChunk=%d/%d: %w", c.rowGroup.Ordinal, c.Column(), len(c.rowGroup.Columns), err) } - index := &offsetIndex + index := &FileOffsetIndex{index: &offsetIndex} if !c.offsetIndex.CompareAndSwap(nil, index) { // another goroutine populated it since we last read the pointer return c.offsetIndex.Load(), nil @@ -549,8 +693,40 @@ func (c *fileColumnChunk) readOffsetIndex() (*format.OffsetIndex, error) { return index, nil } -type filePages struct { - chunk *fileColumnChunk +func (c *FileColumnChunk) readBloomFilter(reader io.ReaderAt) (*FileBloomFilter, error) { + if filter := c.bloomFilter.Load(); filter != nil { + return filter, nil + } + columnChunkMetaData := &c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()].MetaData + offset := columnChunkMetaData.BloomFilterOffset + length := c.file.size - offset + if 
offset == 0 { + return nil, nil + } + + section := io.NewSectionReader(reader, offset, length) + rbuf, rbufpool := getBufioReader(section, 1024) + defer putBufioReader(rbuf, rbufpool) + + header := format.BloomFilterHeader{} + compact := thrift.CompactProtocol{} + decoder := thrift.NewDecoder(compact.NewReader(rbuf)) + + if err := decoder.Decode(&header); err != nil { + return nil, fmt.Errorf("decoding bloom filter header: %w", err) + } + + offset, _ = section.Seek(0, io.SeekCurrent) + filter := newBloomFilter(reader, offset, &header) + + if !c.bloomFilter.CompareAndSwap(nil, filter) { + return c.bloomFilter.Load(), nil + } + return filter, nil +} + +type FilePages struct { + chunk *FileColumnChunk rbuf *bufio.Reader rbufpool *sync.Pool section io.SectionReader @@ -568,7 +744,7 @@ type filePages struct { bufferSize int } -func (f *filePages) init(c *fileColumnChunk) { +func (f *FilePages) init(c *FileColumnChunk, reader io.ReaderAt) { f.chunk = c f.baseOffset = c.chunk.MetaData.DataPageOffset f.dataOffset = f.baseOffset @@ -579,12 +755,28 @@ func (f *filePages) init(c *fileColumnChunk) { f.dictOffset = f.baseOffset } - f.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize) + f.section = *io.NewSectionReader(reader, f.baseOffset, c.chunk.MetaData.TotalCompressedSize) f.rbuf, f.rbufpool = getBufioReader(&f.section, f.bufferSize) f.decoder.Reset(f.protocol.NewReader(f.rbuf)) } -func (f *filePages) ReadPage() (Page, error) { +// ReadDictionary returns the dictionary of the column chunk, or nil if the +// column chunk did not have one. +// +// The program is not required to call this method before calling ReadPage; +// the dictionary is read automatically when needed. It is exposed to allow +// programs to access the dictionary without reading the first page. +func (f *FilePages) ReadDictionary() (Dictionary, error) { + if f.dictionary == nil && f.dictOffset > 0 { + if err := f.readDictionary(); err != nil { + return nil, err + } + } + return f.dictionary, nil +} + +// ReadPage reads the next page from f.
+func (f *FilePages) ReadPage() (Page, error) { if f.chunk == nil { return nil, io.EOF } @@ -658,8 +850,8 @@ func (f *filePages) ReadPage() (Page, error) { } } -func (f *filePages) readDictionary() error { - chunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize) +func (f *FilePages) readDictionary() error { + chunk := io.NewSectionReader(f.section.Outer()) rbuf, pool := getBufioReader(chunk, f.bufferSize) defer putBufioReader(rbuf, pool) @@ -681,7 +873,7 @@ func (f *filePages) readDictionary() error { return f.readDictionaryPage(header, page) } -func (f *filePages) readDictionaryPage(header *format.PageHeader, page *buffer) error { +func (f *FilePages) readDictionaryPage(header *format.PageHeader, page *buffer) error { if header.DictionaryPageHeader == nil { return ErrMissingPageHeader } @@ -693,7 +885,7 @@ func (f *filePages) readDictionaryPage(header *format.PageHeader, page *buffer) return nil } -func (f *filePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, error) { +func (f *FilePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, error) { if header.DataPageHeader == nil { return nil, ErrMissingPageHeader } @@ -705,7 +897,7 @@ func (f *filePages) readDataPageV1(header *format.PageHeader, page *buffer) (Pag return f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, page, f.dictionary, header.UncompressedPageSize) } -func (f *filePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) { +func (f *FilePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) { if header.DataPageHeaderV2 == nil { return nil, ErrMissingPageHeader } @@ -720,7 +912,7 @@ func (f *filePages) readDataPageV2(header *format.PageHeader, page *buffer) (Pag return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, page, f.dictionary, header.UncompressedPageSize) } -func (f *filePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) { +func (f *FilePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) { page := buffers.get(int(header.CompressedPageSize)) defer page.unref() @@ -754,7 +946,8 @@ func (f *filePages) readPage(header *format.PageHeader, reader *bufio.Reader) (* return page, nil } -func (f *filePages) SeekToRow(rowIndex int64) (err error) { +// SeekToRow seeks to the given row index in the column chunk. +func (f *FilePages) SeekToRow(rowIndex int64) (err error) { if f.chunk == nil { return io.ErrClosedPipe } @@ -766,7 +959,7 @@ func (f *filePages) SeekToRow(rowIndex int64) (err error) { f.index = 1 } } else { - pages := index.PageLocations + pages := index.index.PageLocations index := sort.Search(len(pages), func(i int) bool { return pages[i].FirstRowIndex > rowIndex }) - 1 @@ -781,7 +974,8 @@ func (f *filePages) SeekToRow(rowIndex int64) (err error) { return err } -func (f *filePages) Close() error { +// Close closes the page reader. 
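
PagesFrom decouples page reads from the io.ReaderAt the file was opened with, which is useful when page data should go through a different code path (say, a caching or rate-limited reader) than the metadata. A usage sketch, where scanChunk is a hypothetical helper:

```go
package example

import (
	"io"

	"github.com/parquet-go/parquet-go"
)

// scanChunk drains every page of a column chunk through the exported
// FilePages type, reading from a caller-supplied io.ReaderAt.
func scanChunk(chunk *parquet.FileColumnChunk, r io.ReaderAt) error {
	pages := chunk.PagesFrom(r)
	defer pages.Close()

	for {
		page, err := pages.ReadPage()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		// ... consume page.Values() here ...
		parquet.Release(page)
	}
}
```
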
+func (f *FilePages) Close() error { putBufioReader(f.rbuf, f.rbufpool) f.chunk = nil f.section = io.SectionReader{} @@ -796,7 +990,7 @@ func (f *filePages) Close() error { return nil } -func (f *filePages) columnPath() columnPath { +func (f *FilePages) columnPath() columnPath { return columnPath(f.chunk.column.Path()) } @@ -852,3 +1046,29 @@ func readAt(r io.ReaderAt, p []byte, off int64) (n int, err error) { } return } + +type optimisticFileReaderAt struct { + reader io.ReaderAt + offset int64 + footer []byte +} + +func (r *optimisticFileReaderAt) ReadAt(p []byte, off int64) (n int, err error) { + length := r.offset + int64(len(r.footer)) + + if off >= length { + return 0, io.EOF + } + + if off >= r.offset { + n = copy(p, r.footer[off-r.offset:]) + p = p[n:] + off += int64(n) + if len(p) == 0 { + return n, nil + } + } + + rn, err := r.reader.ReadAt(p, off) + return n + rn, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/format/parquet.go b/vendor/github.com/parquet-go/parquet-go/format/parquet.go index 7939e6a0964fd..5366c919cdd80 100644 --- a/vendor/github.com/parquet-go/parquet-go/format/parquet.go +++ b/vendor/github.com/parquet-go/parquet-go/format/parquet.go @@ -70,6 +70,67 @@ func (t FieldRepetitionType) String() string { } } +// A structure for capturing metadata for estimating the unencoded, +// uncompressed size of data written. This is useful for readers to estimate +// how much memory is needed to reconstruct data in their memory model and for +// fine grained filter pushdown on nested structures (the histograms contained +// in this structure can help determine the number of nulls at a particular +// nesting level and maximum length of lists). +type SizeStatistics struct { + // The number of physical bytes stored for BYTE_ARRAY data values assuming + // no encoding. This is exclusive of the bytes needed to store the length of + // each byte array. In other words, this field is equivalent to the `(size + // of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + // written)`. To determine unencoded sizes of other types readers can use + // schema information multiplied by the number of non-null and null values. + // The number of null/non-null values can be inferred from the histograms + // below. + // + // For example, if a column chunk is dictionary-encoded with dictionary + // ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + // then this value for that data page should be 7 (1 + 1 + 2 + 3). + // + // This field should only be set for types that use BYTE_ARRAY as their + // physical type. + UnencodedByteArrayDataBytes *int64 `thrift:"1,optional"` + + // When present, there is expected to be one element corresponding to each + // repetition (i.e. size=max repetition_level+1) where each element + // represents the number of times the repetition level was observed in the + // data. + // + // This field may be omitted if max_repetition_level is 0 without loss + // of information. + RepetitionLevelHistogram []int64 `thrift:"2,optional"` + + // Same as repetition_level_histogram except for definition levels. + // + // This field may be omitted if max_definition_level is 0 or 1 without + // loss of information. + DefinitionLevelHistogram []int64 `thrift:"3,optional"` +} + +// Bounding box for GEOMETRY or GEOGRAPHY type in the representation of min/max +// value pair of coordinates from each axis. 
+type BoundingBox struct { + XMin float64 `thrift:"1,required"` + XMax float64 `thrift:"2,required"` + YMin float64 `thrift:"3,required"` + YMax float64 `thrift:"4,required"` + ZMin *float64 `thrift:"5,optional"` + ZMax *float64 `thrift:"6,optional"` + MMin *float64 `thrift:"7,optional"` + MMax *float64 `thrift:"8,optional"` +} + +// Statistics specific to Geometry and Geography logical types +type GeospatialStatistics struct { + // A bounding box of geospatial instances + BBox *BoundingBox `thrift:"1,optional"` + // Geospatial type codes of all instances, or an empty list if not known + GeoSpatialTypes []int32 `thrift:"2,optional"` +} + // Statistics per row group and per page. // All fields are optional. type Statistics struct { @@ -99,19 +160,21 @@ type Statistics struct { } // Empty structs to use as logical type annotations. -type StringType struct{} // allowed for BINARY, must be encoded with UTF-8 -type UUIDType struct{} // allowed for FIXED[16], must encode raw UUID bytes -type MapType struct{} // see see LogicalTypes.md -type ListType struct{} // see LogicalTypes.md -type EnumType struct{} // allowed for BINARY, must be encoded with UTF-8 -type DateType struct{} // allowed for INT32 - -func (*StringType) String() string { return "STRING" } -func (*UUIDType) String() string { return "UUID" } -func (*MapType) String() string { return "MAP" } -func (*ListType) String() string { return "LIST" } -func (*EnumType) String() string { return "ENUM" } -func (*DateType) String() string { return "DATE" } +type StringType struct{} // allowed for BINARY, must be encoded with UTF-8 +type UUIDType struct{} // allowed for FIXED[16], must encode raw UUID bytes +type MapType struct{} // see LogicalTypes.md +type ListType struct{} // see LogicalTypes.md +type EnumType struct{} // allowed for BINARY, must be encoded with UTF-8 +type DateType struct{} // allowed for INT32 +type Float16Type struct{} // allowed for FIXED[2], must encode raw FLOAT16 bytes + +func (*StringType) String() string { return "STRING" } +func (*UUIDType) String() string { return "UUID" } +func (*MapType) String() string { return "MAP" } +func (*ListType) String() string { return "LIST" } +func (*EnumType) String() string { return "ENUM" } +func (*DateType) String() string { return "DATE" } +func (*Float16Type) String() string { return "FLOAT16" } // Logical type to annotate a column that is always null. // @@ -218,6 +281,91 @@ type BsonType struct{} func (t *BsonType) String() string { return "BSON" } +// Embedded Variant logical type annotation +type VariantType struct{} + +func (*VariantType) String() string { return "VARIANT" } + +// Edge interpolation algorithm for Geography logical type +type EdgeInterpolationAlgorithm int32 + +const ( + Spherical EdgeInterpolationAlgorithm = 0 + Vincenty EdgeInterpolationAlgorithm = 1 + Thomas EdgeInterpolationAlgorithm = 2 + Andoyer EdgeInterpolationAlgorithm = 3 + Karney EdgeInterpolationAlgorithm = 4 +) + +func (e EdgeInterpolationAlgorithm) String() string { + switch e { + case Spherical: + return "SPHERICAL" + case Vincenty: + return "VINCENTY" + case Thomas: + return "THOMAS" + case Andoyer: + return "ANDOYER" + case Karney: + return "KARNEY" + default: + return "EdgeInterpolationAlgorithm(?)" + } +} + +// Embedded Geometry logical type annotation +// +// Geospatial features in the Well-Known Binary (WKB) format; edge interpolation +// is always linear/planar. +// +// A custom CRS can be set by the crs field.
If unset, it defaults to "OGC:CRS84", +// which means that the geometries must be stored in longitude, latitude based on +// the WGS84 datum. +// +// Allowed for physical type: BYTE_ARRAY. +// +// See Geospatial.md for details. +type GeometryType struct { + CRS string `thrift:"1,optional"` +} + +func (t *GeometryType) String() string { + crs := t.CRS + if crs == "" { + crs = "OGC:CRS84" + } + return fmt.Sprintf("GEOMETRY(%q)", crs) +} + +// Embedded Geography logical type annotation +// +// Geospatial features in the WKB format with an explicit (non-linear/non-planar) +// edges interpolation algorithm. +// +// A custom geographic CRS can be set by the crs field, where longitudes are +// bound by [-180, 180] and latitudes are bound by [-90, 90]. If unset, the CRS +// defaults to "OGC:CRS84". +// +// An optional algorithm can be set to correctly interpret edges interpolation +// of the geometries. If unset, the algorithm defaults to SPHERICAL. +// +// Allowed for physical type: BYTE_ARRAY. +// +// See Geospatial.md for details. +type GeographyType struct { + CRS string `thrift:"1,optional"` + Algorithm EdgeInterpolationAlgorithm `thrift:"2,optional"` +} + +func (t *GeographyType) String() string { + crs := t.CRS + if crs == "" { + crs = "OGC:CRS84" + } + return fmt.Sprintf("GEOGRAPHY(%q, %s)", crs, t.Algorithm) +} + // LogicalType annotations to replace ConvertedType. // // To maintain compatibility, implementations using LogicalType for a @@ -240,11 +388,15 @@ type LogicalType struct { // union Timestamp *TimestampType `thrift:"8"` // 9: reserved for Interval - Integer *IntType `thrift:"10"` // use ConvertedType Int* or Uint* - Unknown *NullType `thrift:"11"` // no compatible ConvertedType - Json *JsonType `thrift:"12"` // use ConvertedType JSON - Bson *BsonType `thrift:"13"` // use ConvertedType BSON - UUID *UUIDType `thrift:"14"` // no compatible ConvertedType + Integer *IntType `thrift:"10"` // use ConvertedType Int* or Uint* + Unknown *NullType `thrift:"11"` // no compatible ConvertedType + Json *JsonType `thrift:"12"` // use ConvertedType JSON + Bson *BsonType `thrift:"13"` // use ConvertedType BSON + UUID *UUIDType `thrift:"14"` // no compatible ConvertedType + Float16 *Float16Type `thrift:"15"` // no compatible ConvertedType + Variant *VariantType `thrift:"16"` // no compatible ConvertedType + Geometry *GeometryType `thrift:"17"` // no compatible ConvertedType + Geography *GeographyType `thrift:"18"` // no compatible ConvertedType } func (t *LogicalType) String() string { @@ -275,6 +427,14 @@ func (t *LogicalType) String() string { return t.Bson.String() case t.UUID != nil: return t.UUID.String() + case t.Float16 != nil: + return t.Float16.String() + case t.Variant != nil: + return t.Variant.String() + case t.Geometry != nil: + return t.Geometry.String() + case t.Geography != nil: + return t.Geography.String() default: return "" } @@ -743,6 +903,29 @@ type ColumnMetaData struct { // Byte offset from beginning of file to Bloom filter data. BloomFilterOffset int64 `thrift:"14,optional"` + + // Size of Bloom filter data including the serialized header, in bytes. + // Added in 2.10 so readers may not read this field from old files and + // it can be obtained after the BloomFilterHeader has been deserialized. + // Writers should write this field so readers can read the bloom filter + // in a single I/O. + BloomFilterLength *int32 `thrift:"15,optional"` + + // Optional statistics to help estimate total memory when converted to in-memory + // representations. 
The histograms contained in these statistics can + // also be useful in some cases for more fine-grained nullability/list length + // filter pushdown. + // TODO: Uncomment this field when Thrift decoding is fixed. Strangely, when it is + // uncommented, test cases in file_test.go fail with an inexplicable error decoding + // an unrelated field: + // reading parquet file metadata: decoding thrift payload: 4:FIELD → 0/1:LIST: missing required field: 2:FIELD + // (Seems to be complaining about field TotalBytesSize of RowGroup). This only occurs + // with testdata/dict-page-offset-zero.parquet, in both TestOpenFileWithoutPageIndex + // and TestOpenFile. + //SizeStatistics *SizeStatistics `thrift:"16,optional"` + + // Optional statistics specific for Geometry and Geography logical types + GeospatialStatistics *GeospatialStatistics `thrift:"17,optional"` } type EncryptionWithFooterKey struct{} @@ -856,6 +1039,9 @@ type ColumnOrder struct { // union // ENUM - unsigned byte-wise comparison // LIST - undefined // MAP - undefined + // VARIANT - undefined + // GEOMETRY - undefined + // GEOGRAPHY - undefined // // In the absence of logical types, the sort order is determined by the physical type: // BOOLEAN - false, true @@ -895,6 +1081,12 @@ type OffsetIndex struct { // PageLocations, ordered by increasing PageLocation.offset. It is required // that page_locations[i].first_row_index < page_locations[i+1].first_row_index. PageLocations []PageLocation `thrift:"1,required"` + + // Unencoded/uncompressed size for BYTE_ARRAY types. + // + // See documention for unencoded_byte_array_data_bytes in SizeStatistics for + // more details on this field. + UnencodedByteArrayDataBytes []int64 `thrift:"2,optional"` } // Description for ColumnIndex. @@ -926,6 +1118,21 @@ type ColumnIndex struct { // A list containing the number of null values for each page. NullCounts []int64 `thrift:"5,optional"` + + // Contains repetition level histograms for each page + // concatenated together. The repetition_level_histogram field on + // SizeStatistics contains more details. + // + // When present the length should always be (number of pages * + // (max_repetition_level + 1)) elements. + // + // Element 0 is the first element of the histogram for the first page. + // Element (max_repetition_level + 1) is the first element of the histogram + // for the second page. + RepetitionLevelHistogram []int64 `thrift:"6,optional"` + + // Same as repetition_level_histograms except for definitions levels. 
+ DefinitionLevelHistogram []int64 `thrift:"7,optional"` } type AesGcmV1 struct { diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bytealg/count_amd64.s b/vendor/github.com/parquet-go/parquet-go/internal/bytealg/count_amd64.s index d429614098b68..1a75a0f798aa6 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bytealg/count_amd64.s +++ b/vendor/github.com/parquet-go/parquet-go/internal/bytealg/count_amd64.s @@ -20,7 +20,7 @@ TEXT ·Count(SB), NOSPLIT, $0-40 XORQ R12, R12 XORQ R13, R13 XORQ R14, R14 - XORQ R15, R15 + XORQ DI, DI CMPB ·hasAVX512Count(SB), $0 JE initAVX2 @@ -49,14 +49,14 @@ loopAVX512: ADDQ R8, R12 ADDQ R9, R13 ADDQ R10, R14 - ADDQ R11, R15 + ADDQ R11, DI ADDQ $256, AX CMPQ AX, DX JNE loopAVX512 ADDQ R12, R13 - ADDQ R14, R15 + ADDQ R14, DI ADDQ R13, SI - ADDQ R15, SI + ADDQ DI, SI JMP doneAVX initAVX2: @@ -74,12 +74,12 @@ loopAVX2: POPCNTL R12, R12 POPCNTL R13, R13 ADDQ R12, R14 - ADDQ R13, R15 + ADDQ R13, DI ADDQ $64, AX CMPQ AX, DX JNE loopAVX2 ADDQ R14, SI - ADDQ R15, SI + ADDQ DI, SI doneAVX: VZEROUPPER diff --git a/vendor/github.com/parquet-go/parquet-go/merge.go b/vendor/github.com/parquet-go/parquet-go/merge.go index c45cd7103d3e7..6f7a8c4af93a2 100644 --- a/vendor/github.com/parquet-go/parquet-go/merge.go +++ b/vendor/github.com/parquet-go/parquet-go/merge.go @@ -33,7 +33,7 @@ func MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, schema = rowGroups[0].Schema() for _, rowGroup := range rowGroups[1:] { - if !nodesAreEqual(schema, rowGroup.Schema()) { + if !EqualNodes(schema, rowGroup.Schema()) { return nil, ErrRowGroupSchemaMismatch } } @@ -43,7 +43,7 @@ func MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, copy(mergedRowGroups, rowGroups) for i, rowGroup := range mergedRowGroups { - if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) { + if rowGroupSchema := rowGroup.Schema(); !EqualNodes(schema, rowGroupSchema) { conv, err := Convert(schema, rowGroupSchema) if err != nil { return nil, fmt.Errorf("cannot merge row groups: %w", err) diff --git a/vendor/github.com/parquet-go/parquet-go/multi_row_group.go b/vendor/github.com/parquet-go/parquet-go/multi_row_group.go index b68c8782a670b..d37767fd56ec3 100644 --- a/vendor/github.com/parquet-go/parquet-go/multi_row_group.go +++ b/vendor/github.com/parquet-go/parquet-go/multi_row_group.go @@ -7,10 +7,6 @@ import ( // MultiRowGroup wraps multiple row groups to appear as if it was a single // RowGroup. RowGroups must have the same schema or it will error. func MultiRowGroup(rowGroups ...RowGroup) RowGroup { - return newMultiRowGroup(ReadModeSync, rowGroups...) -} - -func newMultiRowGroup(pageReadMode ReadMode, rowGroups ...RowGroup) RowGroup { if len(rowGroups) == 0 { return &emptyRowGroup{} } @@ -26,9 +22,7 @@ func newMultiRowGroup(pageReadMode ReadMode, rowGroups ...RowGroup) RowGroup { rowGroupsCopy := make([]RowGroup, len(rowGroups)) copy(rowGroupsCopy, rowGroups) - c := &multiRowGroup{ - pageReadMode: pageReadMode, - } + c := new(multiRowGroup) c.init(schema, rowGroupsCopy) return c } @@ -81,7 +75,7 @@ func compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) { // Slow path: The schema pointers are not the same, but they still have to // be compatible. 
for _, rowGroup := range rowGroups[1:] { - if !nodesAreEqual(schema, rowGroup.Schema()) { + if !EqualNodes(schema, rowGroup.Schema()) { return nil, ErrRowGroupSchemaMismatch } } @@ -90,10 +84,9 @@ func compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) { } type multiRowGroup struct { - schema *Schema - rowGroups []RowGroup - columns []ColumnChunk - pageReadMode ReadMode + schema *Schema + rowGroups []RowGroup + columns []ColumnChunk } func (c *multiRowGroup) NumRows() (numRows int64) { @@ -109,7 +102,7 @@ func (c *multiRowGroup) SortingColumns() []SortingColumn { return nil } func (c *multiRowGroup) Schema() *Schema { return c.schema } -func (c *multiRowGroup) Rows() Rows { return newRowGroupRows(c, c.pageReadMode) } +func (c *multiRowGroup) Rows() Rows { return NewRowGroupRowReader(c) } type multiColumnChunk struct { rowGroup *multiRowGroup diff --git a/vendor/github.com/parquet-go/parquet-go/node.go b/vendor/github.com/parquet-go/parquet-go/node.go index b3540b09e06f9..25603a11f4758 100644 --- a/vendor/github.com/parquet-go/parquet-go/node.go +++ b/vendor/github.com/parquet-go/parquet-go/node.go @@ -2,7 +2,8 @@ package parquet import ( "reflect" - "sort" + "slices" + "strings" "unicode" "unicode/utf8" @@ -288,8 +289,8 @@ func (g Group) Fields() []Field { name: name, }) } - sort.Slice(groupFields, func(i, j int) bool { - return groupFields[i].name < groupFields[j].name + slices.SortFunc(groupFields, func(a, b groupField) int { + return strings.Compare(a.name, b.name) }) fields := make([]Field, len(groupFields)) for i := range groupFields { @@ -486,7 +487,15 @@ func fieldByName(node Node, name string) Field { return nil } -func nodesAreEqual(node1, node2 Node) bool { +// EqualNodes returns true if node1 and node2 are equal. +// +// Nodes that are not of the same repetition type (optional, required, repeated) or +// of the same hierarchical type (leaf, group) are considered not equal. +// Leaf nodes are considered equal if they are of the same data type. +// Group nodes are considered equal if their fields have the same names and are recursively equal. +// +// Note that the encoding and compression of the nodes are not considered by this function. 
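
Exporting EqualNodes (the function that follows) gives callers the same schema-compatibility check the library itself applies in MergeRowGroups and the readers. A sketch of the point made in the note above, with hypothetical row types:

```go
package example

import "github.com/parquet-go/parquet-go"

type WriterRow struct {
	ID   int64  `parquet:"id"`
	Name string `parquet:"name,snappy"` // compressed on the writer side
}

type ReaderRow struct {
	ID   int64  `parquet:"id"`
	Name string `parquet:"name"` // no compression declared
}

// schemasMatch reports true: EqualNodes compares structure, repetition and
// data types, but deliberately ignores encoding and compression.
func schemasMatch() bool {
	return parquet.EqualNodes(
		parquet.SchemaOf(WriterRow{}),
		parquet.SchemaOf(ReaderRow{}),
	)
}
```
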
+func EqualNodes(node1, node2 Node) bool { if node1.Leaf() { return node2.Leaf() && leafNodesAreEqual(node1, node2) } else { @@ -528,7 +537,7 @@ func groupNodesAreEqual(node1, node2 Node) bool { return false } - if !nodesAreEqual(f1, f2) { + if !EqualNodes(f1, f2) { return false } } diff --git a/vendor/github.com/parquet-go/parquet-go/offset_index.go b/vendor/github.com/parquet-go/parquet-go/offset_index.go index 49b9041ab55e6..b499e427cd493 100644 --- a/vendor/github.com/parquet-go/parquet-go/offset_index.go +++ b/vendor/github.com/parquet-go/parquet-go/offset_index.go @@ -23,22 +23,24 @@ type OffsetIndex interface { FirstRowIndex(int) int64 } -type fileOffsetIndex format.OffsetIndex +type FileOffsetIndex struct { + index *format.OffsetIndex +} -func (i *fileOffsetIndex) NumPages() int { - return len(i.PageLocations) +func (i *FileOffsetIndex) NumPages() int { + return len(i.index.PageLocations) } -func (i *fileOffsetIndex) Offset(j int) int64 { - return i.PageLocations[j].Offset +func (i *FileOffsetIndex) Offset(j int) int64 { + return i.index.PageLocations[j].Offset } -func (i *fileOffsetIndex) CompressedPageSize(j int) int64 { - return int64(i.PageLocations[j].CompressedPageSize) +func (i *FileOffsetIndex) CompressedPageSize(j int) int64 { + return int64(i.index.PageLocations[j].CompressedPageSize) } -func (i *fileOffsetIndex) FirstRowIndex(j int) int64 { - return i.PageLocations[j].FirstRowIndex +func (i *FileOffsetIndex) FirstRowIndex(j int) int64 { + return i.index.PageLocations[j].FirstRowIndex } type emptyOffsetIndex struct{} diff --git a/vendor/github.com/parquet-go/parquet-go/page.go b/vendor/github.com/parquet-go/parquet-go/page.go index 0c3ef0329a898..9dc60b247d3f5 100644 --- a/vendor/github.com/parquet-go/parquet-go/page.go +++ b/vendor/github.com/parquet-go/parquet-go/page.go @@ -60,6 +60,14 @@ type Page interface { // like parquet.Int32Reader. Applications should use type assertions on // the returned reader to determine whether those optimizations are // available. + // + // In the data page format version 1, it wasn't specified whether pages + // must start with a new row. Legacy writers have produced parquet files + // where row values were overlapping between two consecutive pages. + // As a result, the values read must not be assumed to start at the + // beginning of a row, unless the program knows that it is only working + // with parquet files that used the data page format version 2 (which is + // the default behavior for parquet-go). Values() ValueReader // Returns a new page which is as slice of the receiver between row indexes @@ -113,8 +121,20 @@ type Pages interface { // be reading pages from a high latency backend, and the last // page read may be processed while initiating reading of the next page. func AsyncPages(pages Pages) Pages { - p := new(asyncPages) - p.init(pages, nil) + read := make(chan asyncPage) + seek := make(chan asyncSeek, 1) + init := make(chan struct{}) + done := make(chan struct{}) + + go readPages(pages, read, seek, init, done) + + p := &asyncPages{ + read: read, + seek: seek, + init: init, + done: done, + } + // If the pages object gets garbage collected without Close being called, // this finalizer would ensure that the goroutine is stopped and doesn't // leak. 
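
The rewritten AsyncPages no longer starts reading in the background at construction time; the goroutine blocks on the init channel until ReadPage or SeekToRow is first called, so wrapping is cheap even for chunks a query ends up skipping. A usage sketch, where readAllAsync is a hypothetical helper:

```go
package example

import (
	"io"

	"github.com/parquet-go/parquet-go"
)

// readAllAsync prefetches the next page on a goroutine while the caller
// processes the current one, which helps with high-latency storage.
func readAllAsync(chunk parquet.ColumnChunk) error {
	pages := parquet.AsyncPages(chunk.Pages())
	defer pages.Close() // stops the background goroutine

	for {
		page, err := pages.ReadPage()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		parquet.Release(page)
	}
}
```
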
@@ -123,9 +143,10 @@ func AsyncPages(pages Pages) Pages { } type asyncPages struct { - read <-chan asyncPage - seek chan<- int64 - done chan<- struct{} + read chan asyncPage + seek chan asyncSeek + init chan struct{} + done chan struct{} version int64 } @@ -135,22 +156,16 @@ type asyncPage struct { version int64 } -func (pages *asyncPages) init(base Pages, done chan struct{}) { - read := make(chan asyncPage) - seek := make(chan int64, 1) - - pages.read = read - pages.seek = seek - - if done == nil { - done = make(chan struct{}) - pages.done = done - } - - go readPages(base, read, seek, done) +type asyncSeek struct { + rowIndex int64 + version int64 } func (pages *asyncPages) Close() (err error) { + if pages.init != nil { + close(pages.init) + pages.init = nil + } if pages.done != nil { close(pages.done) pages.done = nil @@ -165,6 +180,7 @@ func (pages *asyncPages) Close() (err error) { } func (pages *asyncPages) ReadPage() (Page, error) { + pages.start() for { p, ok := <-pages.read if !ok { @@ -187,42 +203,83 @@ func (pages *asyncPages) SeekToRow(rowIndex int64) error { if pages.seek == nil { return io.ErrClosedPipe } + // First flush the channel in case SeekToRow is called twice or more in a + // row, otherwise we would block if readPages had already exited. + select { + case <-pages.seek: + default: + pages.version++ + } // The seek channel has a capacity of 1 to allow the first SeekToRow call to // be non-blocking. // // If SeekToRow calls are performed faster than they can be handled by the // goroutine reading pages, this path might become a contention point. - pages.seek <- rowIndex - pages.version++ + pages.seek <- asyncSeek{rowIndex: rowIndex, version: pages.version} + pages.start() return nil } -func readPages(pages Pages, read chan<- asyncPage, seek <-chan int64, done <-chan struct{}) { +func (pages *asyncPages) start() { + if pages.init != nil { + close(pages.init) + pages.init = nil + } +} + +func readPages(pages Pages, read chan<- asyncPage, seek <-chan asyncSeek, init, done <-chan struct{}) { defer func() { read <- asyncPage{err: pages.Close(), version: -1} close(read) }() - version := int64(0) + // To avoid reading pages before the first SeekToRow call, we wait for the + // reader to be initialized, which means it either received a call to + // ReadPage, SeekToRow, or Close. + select { + case <-init: + case <-done: + return + } + + // If SeekToRow was invoked before ReadPage, the seek channel contains the + // new position of the reader. + // + // Note that we have a default case in this select because we don't want to + // block if the first call was ReadPage and no values were ever produced to + // the seek channel. 
+ var seekTo asyncSeek + select { + case seekTo = <-seek: + default: + seekTo.rowIndex = -1 + } + for { - page, err := pages.ReadPage() - - for { - select { - case <-done: - return - case read <- asyncPage{ - page: page, - err: err, - version: version, - }: - case rowIndex := <-seek: - version++ - err = pages.SeekToRow(rowIndex) - } + var page Page + var err error + + if seekTo.rowIndex >= 0 { + err = pages.SeekToRow(seekTo.rowIndex) if err == nil { - break + seekTo.rowIndex = -1 + continue } + } else { + page, err = pages.ReadPage() + } + + select { + case read <- asyncPage{ + page: page, + err: err, + version: seekTo.version, + }: + case seekTo = <-seek: + Release(page) + case <-done: + Release(page) + return } } } diff --git a/vendor/github.com/parquet-go/parquet-go/reader.go b/vendor/github.com/parquet-go/parquet-go/reader.go index 2f444bb739b83..7be276855dd0f 100644 --- a/vendor/github.com/parquet-go/parquet-go/reader.go +++ b/vendor/github.com/parquet-go/parquet-go/reader.go @@ -5,6 +5,8 @@ import ( "fmt" "io" "reflect" + + "github.com/parquet-go/parquet-go/format" ) // GenericReader is similar to a Reader but uses a type parameter to define the @@ -51,13 +53,14 @@ func NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *Generi r := &GenericReader[T]{ base: Reader{ file: reader{ + file: f, schema: c.Schema, rowGroup: rowGroup, }, }, } - if !nodesAreEqual(c.Schema, f.schema) { + if !EqualNodes(c.Schema, f.schema) { r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema) } @@ -90,7 +93,7 @@ func NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) }, } - if !nodesAreEqual(c.Schema, rowGroup.Schema()) { + if !EqualNodes(c.Schema, rowGroup.Schema()) { r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema) } @@ -134,6 +137,11 @@ func (r *GenericReader[T]) Close() error { return r.base.Close() } +// File returns a FileView of the underlying parquet file. +func (r *GenericReader[T]) File() FileView { + return r.base.File() +} + // readRows reads the next rows from the reader into the given rows slice up to len(rows). // // The returned values are safe to reuse across readRows calls and do not share @@ -274,6 +282,7 @@ func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader { r := &Reader{ file: reader{ + file: f, schema: f.schema, rowGroup: fileRowGroupOf(f), }, @@ -309,7 +318,7 @@ func fileRowGroupOf(f *File) RowGroup { default: // TODO: should we attempt to merge the row groups via MergeRowGroups // to preserve the global order of sorting columns within the file? - return newMultiRowGroup(f.config.ReadMode, rowGroups...) + return MultiRowGroup(rowGroups...) 
} } @@ -337,7 +346,7 @@ func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader { } func convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup { - if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) { + if rowGroupSchema := rowGroup.Schema(); !EqualNodes(schema, rowGroupSchema) { conv, err := Convert(schema, rowGroupSchema) if err != nil { // TODO: this looks like something we should not be panicking on, @@ -416,7 +425,7 @@ func (r *Reader) Read(row interface{}) error { func (r *Reader) updateReadSchema(rowType reflect.Type) error { schema := schemaOf(rowType) - if nodesAreEqual(schema, r.file.schema) { + if EqualNodes(schema, r.file.schema) { r.read.init(schema, r.file.rowGroup) } else { conv, err := Convert(schema, r.file.schema) @@ -477,6 +486,7 @@ func (r *Reader) Close() error { // read rows into Go values, potentially doing partial reads on a subset of the // columns due to using a converted row group view. type reader struct { + file *File schema *Schema rowGroup RowGroup rows Rows @@ -558,3 +568,104 @@ var ( _ RowReader = (*reader)(nil) _ RowSeeker = (*reader)(nil) ) + +type readerFileView struct { + reader *reader + schema *Schema +} + +// File returns a FileView of the parquet file being read. +// Only available if Reader was created with a File. +func (r *Reader) File() FileView { + if r.file.schema == nil || r.file.file == nil { + return nil + } + return &readerFileView{ + &r.file, + r.file.schema, + } +} + +func (r *readerFileView) Metadata() *format.FileMetaData { + if r.reader.file != nil { + return r.reader.file.Metadata() + } + return nil +} + +func (r *readerFileView) Schema() *Schema { + return r.schema +} + +func (r *readerFileView) NumRows() int64 { + return r.reader.rowGroup.NumRows() +} + +func (r *readerFileView) Lookup(key string) (string, bool) { + if meta := r.Metadata(); meta != nil { + return lookupKeyValueMetadata(meta.KeyValueMetadata, key) + } + return "", false +} + +func (r *readerFileView) Size() int64 { + if r.reader.file != nil { + return r.reader.file.Size() + } + return 0 +} + +func (r *readerFileView) ColumnIndexes() []format.ColumnIndex { + if r.reader.file != nil { + return r.reader.file.ColumnIndexes() + } + return nil +} + +func (r *readerFileView) OffsetIndexes() []format.OffsetIndex { + if r.reader.file != nil { + return r.reader.file.OffsetIndexes() + } + return nil +} + +func (r *readerFileView) Root() *Column { + if meta := r.Metadata(); meta != nil { + root, _ := openColumns(nil, meta, r.ColumnIndexes(), r.OffsetIndexes()) + return root + } + return nil +} + +func (r *readerFileView) RowGroups() []RowGroup { + file := r.reader.file + if file == nil { + return nil + } + columns := makeLeafColumns(r.Root()) + fileRowGroups := makeFileRowGroups(file, columns) + return makeRowGroups(fileRowGroups) +} + +func makeLeafColumns(root *Column) []*Column { + columns := make([]*Column, 0, numLeafColumnsOf(root)) + root.forEachLeaf(func(c *Column) { columns = append(columns, c) }) + return columns +} + +func makeFileRowGroups(file *File, columns []*Column) []FileRowGroup { + rowGroups := file.metadata.RowGroups + fileRowGroups := make([]FileRowGroup, len(rowGroups)) + for i := range fileRowGroups { + fileRowGroups[i].init(file, columns, &rowGroups[i]) + } + return fileRowGroups +} + +func makeRowGroups(fileRowGroups []FileRowGroup) []RowGroup { + rowGroups := make([]RowGroup, len(fileRowGroups)) + for i := range fileRowGroups { + rowGroups[i] = &fileRowGroups[i] + } + return rowGroups +} diff --git 
a/vendor/github.com/parquet-go/parquet-go/row.go b/vendor/github.com/parquet-go/parquet-go/row.go index c7f314925bfbd..cc8d4f278030a 100644 --- a/vendor/github.com/parquet-go/parquet-go/row.go +++ b/vendor/github.com/parquet-go/parquet-go/row.go @@ -127,9 +127,7 @@ type RowSeeker interface { // RowReader reads a sequence of parquet rows. type RowReader interface { // ReadRows reads rows from the reader, returning the number of rows read - // into the buffer, and any error that occurred. Note that the rows read - // into the buffer are not safe for reuse after a subsequent call to - // ReadRows. Callers that want to reuse rows must copy the rows using Clone. + // into the buffer, and any error that occurred. // // When all rows have been read, the reader returns io.EOF to indicate the // end of the sequence. It is valid for the reader to return both a non-zero @@ -165,6 +163,21 @@ type RowReadSeeker interface { RowSeeker } +// RowReadCloser is an interface implemented by row readers which require +// closing when done. +type RowReadCloser interface { + RowReader + io.Closer +} + +// RowReadSeekCloser is an interface implemented by row readers which support +// seeking to arbitrary row positions and require closing the reader when done. +type RowReadSeekCloser interface { + RowReader + RowSeeker + io.Closer +} + // RowWriter writes parquet rows to an underlying medium. type RowWriter interface { // Writes rows to the writer, returning the number of rows written and any @@ -298,7 +311,7 @@ func copyRows(dst RowWriter, src RowReader, buf []Row) (written int64, err error sourceSchema := sourceSchemaOf(src) if targetSchema != nil && sourceSchema != nil { - if !nodesAreEqual(targetSchema, sourceSchema) { + if !EqualNodes(targetSchema, sourceSchema) { conv, err := Convert(targetSchema, sourceSchema) if err != nil { return 0, err @@ -582,7 +595,7 @@ func reconstructFuncOfOptional(columnIndex int16, node Node) (int16, reconstruct levels.definitionLevel++ if columns[0][0].definitionLevel < levels.definitionLevel { - value.Set(reflect.Zero(value.Type())) + value.SetZero() return nil } @@ -600,13 +613,23 @@ func reconstructFuncOfOptional(columnIndex int16, node Node) (int16, reconstruct func setMakeSlice(v reflect.Value, n int) reflect.Value { t := v.Type() if t.Kind() == reflect.Interface { - t = reflect.TypeOf(([]interface{})(nil)) + t = reflect.TypeOf(([]any)(nil)) } s := reflect.MakeSlice(t, n, n) v.Set(s) return s } +func setNullSlice(v reflect.Value) reflect.Value { + t := v.Type() + if t.Kind() == reflect.Interface { + t = reflect.TypeOf(([]any)(nil)) + } + s := reflect.Zero(t) + v.Set(s) + return s +} + //go:noinline func reconstructFuncOfRepeated(columnIndex int16, node Node) (int16, reconstructFunc) { nextColumnIndex, reconstruct := reconstructFuncOf(columnIndex, Required(node)) @@ -686,7 +709,6 @@ func reconstructFuncOfMap(columnIndex int16, node Node) (int16, reconstructFunc) keyValue := mapKeyValueOf(node) keyValueType := keyValue.GoType() keyValueElem := keyValueType.Elem() - keyValueZero := reflect.Zero(keyValueElem) nextColumnIndex, reconstruct := reconstructFuncOf(columnIndex, schemaOf(keyValueElem)) return nextColumnIndex, func(value reflect.Value, levels levels, columns [][]Value) error { levels.repetitionDepth++ @@ -723,7 +745,7 @@ func reconstructFuncOfMap(columnIndex int16, node Node) (int16, reconstructFunc) if value.IsNil() { m := reflect.MakeMapWithSize(t, n) value.Set(m) - value = m // track map instead of interface{} for read[any]() + value = m // track map instead of any
for read[any]() } elem := reflect.New(keyValueElem).Elem() @@ -748,7 +770,7 @@ func reconstructFuncOfMap(columnIndex int16, node Node) (int16, reconstructFunc) } value.SetMapIndex(elem.Field(0).Convert(k), elem.Field(1).Convert(v)) - elem.Set(keyValueZero) + elem.SetZero() levels.repetitionLevel = levels.repetitionDepth } @@ -770,7 +792,7 @@ func reconstructFuncOfGroup(columnIndex int16, node Node) (int16, reconstructFun return columnIndex, func(value reflect.Value, levels levels, columns [][]Value) error { if value.Kind() == reflect.Interface { - value.Set(reflect.MakeMap(reflect.TypeOf((map[string]interface{})(nil)))) + value.Set(reflect.MakeMap(reflect.TypeOf((map[string]any)(nil)))) value = value.Elem() } @@ -778,7 +800,6 @@ func reconstructFuncOfGroup(columnIndex int16, node Node) (int16, reconstructFun elemType := value.Type().Elem() name := reflect.New(reflect.TypeOf("")).Elem() elem := reflect.New(elemType).Elem() - zero := reflect.Zero(elemType) if value.Len() > 0 { value.Set(reflect.MakeMap(value.Type())) @@ -795,7 +816,7 @@ func reconstructFuncOfGroup(columnIndex int16, node Node) (int16, reconstructFun } off = end value.SetMapIndex(name, elem) - elem.Set(zero) + elem.SetZero() } } else { off := int16(0) diff --git a/vendor/github.com/parquet-go/parquet-go/row_group.go b/vendor/github.com/parquet-go/parquet-go/row_group.go index cf11bfca11782..43a6d7a9d73de 100644 --- a/vendor/github.com/parquet-go/parquet-go/row_group.go +++ b/vendor/github.com/parquet-go/parquet-go/row_group.go @@ -1,6 +1,7 @@ package parquet import ( + "errors" "fmt" "io" @@ -60,9 +61,8 @@ type RowGroup interface { // // After calling Close, all attempts to read more rows will return io.EOF. type Rows interface { - RowReaderWithSchema - RowSeeker - io.Closer + RowReadSeekCloser + Schema() *Schema } // RowGroupReader is an interface implemented by types that expose sequences of @@ -163,140 +163,124 @@ func (r *rowGroup) NumRows() int64 { return r.numRows } func (r *rowGroup) ColumnChunks() []ColumnChunk { return r.columns } func (r *rowGroup) SortingColumns() []SortingColumn { return r.sorting } func (r *rowGroup) Schema() *Schema { return r.schema } -func (r *rowGroup) Rows() Rows { return newRowGroupRows(r, ReadModeSync) } - -func NewRowGroupRowReader(rowGroup RowGroup) Rows { - return newRowGroupRows(rowGroup, ReadModeSync) +func (r *rowGroup) Rows() Rows { return NewRowGroupRowReader(r) } + +func AsyncRowGroup(base RowGroup) RowGroup { + columnChunks := base.ColumnChunks() + asyncRowGroup := &rowGroup{ + schema: base.Schema(), + numRows: base.NumRows(), + sorting: base.SortingColumns(), + columns: make([]ColumnChunk, len(columnChunks)), + } + asyncColumnChunks := make([]asyncColumnChunk, len(columnChunks)) + for i, columnChunk := range columnChunks { + asyncColumnChunks[i].ColumnChunk = columnChunk + asyncRowGroup.columns[i] = &asyncColumnChunks[i] + } + return asyncRowGroup } type rowGroupRows struct { - rowGroup RowGroup - buffers []Value - readers []Pages - columns []columnChunkRows - inited bool - closed bool - done chan<- struct{} - pageReadMode ReadMode + schema *Schema + bufsize int + buffers []Value + columns []columnChunkRows + closed bool + rowIndex int64 } type columnChunkRows struct { - rows int64 offset int32 length int32 - page Page - values ValueReader + reader columnChunkValueReader } -const columnBufferSize = defaultValueBufferSize - func (r *rowGroupRows) buffer(i int) []Value { - j := (i + 0) * columnBufferSize - k := (i + 1) * columnBufferSize + j := (i + 0) * r.bufsize + k := (i + 1) * 
r.bufsize return r.buffers[j:k:k] } -func newRowGroupRows(rowGroup RowGroup, pageReadMode ReadMode) *rowGroupRows { - return &rowGroupRows{ - rowGroup: rowGroup, - pageReadMode: pageReadMode, - } +// NewRowGroupRowReader constructs a new row reader for the given row group. +func NewRowGroupRowReader(rowGroup RowGroup) Rows { + return newRowGroupRows(rowGroup.Schema(), rowGroup.ColumnChunks(), defaultValueBufferSize) } -func (r *rowGroupRows) init() { - columns := r.rowGroup.ColumnChunks() - - r.buffers = make([]Value, len(columns)*columnBufferSize) - r.readers = make([]Pages, len(columns)) - r.columns = make([]columnChunkRows, len(columns)) +func newRowGroupRows(schema *Schema, columns []ColumnChunk, bufferSize int) *rowGroupRows { + r := &rowGroupRows{ + schema: schema, + bufsize: bufferSize, + buffers: make([]Value, len(columns)*bufferSize), + columns: make([]columnChunkRows, len(columns)), + rowIndex: -1, + } - switch r.pageReadMode { - case ReadModeAsync: - done := make(chan struct{}) - r.done = done - readers := make([]asyncPages, len(columns)) - for i, column := range columns { - readers[i].init(column.Pages(), done) - r.readers[i] = &readers[i] + for i, column := range columns { + var release func(Page) + // Only release pages that are not byte array because the values + // that were read from the page might be retained by the program + // after calls to ReadRows. + switch column.Type().Kind() { + case ByteArray, FixedLenByteArray: + release = func(Page) {} + default: + release = Release } - case ReadModeSync: - for i, column := range columns { - r.readers[i] = column.Pages() - } - default: - panic(fmt.Sprintf("parquet: invalid page read mode: %d", r.pageReadMode)) + r.columns[i].reader.release = release + r.columns[i].reader.pages = column.Pages() } - r.inited = true // This finalizer is used to ensure that the goroutines started by calling // init on the underlying page readers will be shutdown in the event that // Close isn't called and the rowGroupRows object is garbage collected. debug.SetFinalizer(r, func(r *rowGroupRows) { r.Close() }) + return r } func (r *rowGroupRows) clear() { - for i := range r.columns { - Release(r.columns[i].page) - } - - for i := range r.columns { - r.columns[i] = columnChunkRows{} - } - - for i := range r.buffers { - r.buffers[i] = Value{} + for i, c := range r.columns { + r.columns[i] = columnChunkRows{reader: c.reader} } + clear(r.buffers) } func (r *rowGroupRows) Reset() { - for i := range r.readers { - // Ignore errors because we are resetting the reader, if the error - // persists we will see it on the next read, and otherwise we can - // read back from the beginning. - r.readers[i].SeekToRow(0) + for i := range r.columns { + r.columns[i].reader.Reset() } r.clear() } func (r *rowGroupRows) Close() error { - var lastErr error - - if r.done != nil { - close(r.done) - r.done = nil - } - - for i := range r.readers { - if err := r.readers[i].Close(); err != nil { - lastErr = err + var errs []error + for i := range r.columns { + c := &r.columns[i] + c.offset = 0 + c.length = 0 + if err := c.reader.Close(); err != nil { + errs = append(errs, err) } } - r.clear() - r.inited = true r.closed = true - return lastErr + return errors.Join(errs...)
} func (r *rowGroupRows) SeekToRow(rowIndex int64) error { - var lastErr error - if r.closed { return io.ErrClosedPipe } - - if !r.inited { - r.init() - } - - for i := range r.readers { - if err := r.readers[i].SeekToRow(rowIndex); err != nil { - lastErr = err + if rowIndex != r.rowIndex { + for i := range r.columns { + if err := r.columns[i].reader.SeekToRow(rowIndex); err != nil { + return err + } } + r.clear() + r.rowIndex = rowIndex } - - r.clear() - return lastErr + return nil } func (r *rowGroupRows) ReadRows(rows []Row) (int, error) { @@ -304,115 +288,82 @@ func (r *rowGroupRows) ReadRows(rows []Row) (int, error) { return 0, io.EOF } - if !r.inited { - r.init() + for rowIndex := range rows { + rows[rowIndex] = rows[rowIndex][:0] } - // Limit the number of rows that we read to the smallest number of rows - // remaining in the current page of each column. This is necessary because - // the pointers exposed to the returned rows need to remain valid until the - // next call to ReadRows, SeekToRow, Reset, or Close. If we release one of - // the columns' page, the rows that were already read during the ReadRows - // call would be invalidated, and might reference memory locations that have - // been reused due to pooling of page buffers. - numRows := int64(len(rows)) - - for i := range r.columns { - c := &r.columns[i] - // When all rows of the current page of a column have been consumed we - // have to read the next page. This will effectively invalidate all - // pointers of values previously held in the page, which is valid if - // the application respects the RowReader interface and does not retain - // parquet values without cloning them first. - for c.rows == 0 { - var err error - clearValues(r.buffer(i)) - - c.offset = 0 - c.length = 0 - c.values = nil - Release(c.page) - - c.page, err = r.readers[i].ReadPage() - if err != nil { - if err != io.EOF { - return 0, err - } - break - } - - c.rows = c.page.NumRows() - c.values = c.page.Values() - } - - if c.rows < numRows { - numRows = c.rows + // When this is the first call to ReadRows, we issue a seek to the first row + // because this starts prefetching pages asynchronously on columns. + // + // This condition does not apply if SeekToRow was called before ReadRows, + // only when ReadRows is the very first method called on the row reader. 
+ if r.rowIndex < 0 { + if err := r.SeekToRow(0); err != nil { + return 0, err } } - for i := range rows { - rows[i] = rows[i][:0] - } - - if numRows == 0 { - return 0, io.EOF - } + eofCount := 0 + rowCount := 0 - n, err := r.readRows(rows[:numRows]) +readColumnValues: + for columnIndex := range r.columns { + c := &r.columns[columnIndex] + b := r.buffer(columnIndex) + eof := false - for i := range r.columns { - r.columns[i].rows -= int64(n) - } + for rowIndex := range rows { + numValuesInRow := 1 - return n, err -} - -func (r *rowGroupRows) Schema() *Schema { - return r.rowGroup.Schema() -} - -func (r *rowGroupRows) readRows(rows []Row) (int, error) { - for i := range rows { - readColumns: - for columnIndex := range r.columns { - col := &r.columns[columnIndex] - buf := r.buffer(columnIndex) - - skip := int32(1) for { - if col.offset == col.length { - n, err := col.values.ReadValues(buf) + if c.offset == c.length { + n, err := c.reader.ReadValues(b) + c.offset = 0 + c.length = int32(n) + if n == 0 { - switch err { - case nil: - err = io.ErrNoProgress - case io.EOF: - continue readColumns + if err == io.EOF { + eof = true + eofCount++ + break } - return i, err + return 0, err } - col.offset = 0 - col.length = int32(n) } - _ = buf[:col.offset] - _ = buf[:col.length] - endOffset := col.offset + skip - - for endOffset < col.length && buf[endOffset].repetitionLevel != 0 { - endOffset++ + values := b[c.offset:c.length:c.length] + for numValuesInRow < len(values) && values[numValuesInRow].repetitionLevel != 0 { + numValuesInRow++ + } + if numValuesInRow == 0 { + break } - rows[i] = append(rows[i], buf[col.offset:endOffset]...) + rows[rowIndex] = append(rows[rowIndex], values[:numValuesInRow]...) + rowCount = max(rowCount, rowIndex+1) + c.offset += int32(numValuesInRow) - if col.offset = endOffset; col.offset < col.length { + if numValuesInRow != len(values) { break } - skip = 0 + if eof { + continue readColumnValues + } + numValuesInRow = 0 } } } - return len(rows), nil + + var err error + if eofCount > 0 { + err = io.EOF + } + r.rowIndex += int64(rowCount) + return rowCount, err +} + +func (r *rowGroupRows) Schema() *Schema { + return r.schema } type seekRowGroup struct { diff --git a/vendor/github.com/parquet-go/parquet-go/schema.go b/vendor/github.com/parquet-go/parquet-go/schema.go index 41935a5a2a933..710169fa18959 100644 --- a/vendor/github.com/parquet-go/parquet-go/schema.go +++ b/vendor/github.com/parquet-go/parquet-go/schema.go @@ -19,13 +19,35 @@ import ( // // Schema implements the Node interface to represent the root node of a parquet // schema. +// +// Schema values are safe to use concurrently from multiple goroutines but must +// be passed by reference after being created because their internal state +// contains synchronization primitives that are not safe to copy. type Schema struct { - name string - root Node + name string + root Node + funcs onceValue[schemaFuncs] + state onceValue[schemaState] +} + +type schemaFuncs struct { deconstruct deconstructFunc reconstruct reconstructFunc - mapping columnMapping - columns [][]string +} + +type schemaState struct { + mapping columnMapping + columns [][]string +} + +type onceValue[T any] struct { + once sync.Once + value *T +} + +func (v *onceValue[T]) load(f func() *T) *T { + v.once.Do(func() { v.value = f() }) + return v.value } // SchemaOf constructs a parquet schema from a Go value.
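The refactor above collapses the per-column page-read-mode plumbing into a single buffered reader per column and exports the constructor as NewRowGroupRowReader. A minimal sketch of how an application might drive the new reader; the input.parquet path is a placeholder, and the comment on value lifetime follows the contract described in the ReadRows changes above:

```go
package main

import (
	"fmt"
	"io"
	"log"
	"os"

	"github.com/parquet-go/parquet-go"
)

func main() {
	f, err := os.Open("input.parquet") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	pf, err := parquet.OpenFile(f, stat.Size())
	if err != nil {
		log.Fatal(err)
	}

	// NewRowGroupRowReader starts with rowIndex == -1; the first ReadRows
	// call seeks to row 0, which also begins fetching pages on each column.
	rows := parquet.NewRowGroupRowReader(pf.RowGroups()[0])
	defer rows.Close()

	buf := make([]parquet.Row, 64)
	for {
		n, err := rows.ReadRows(buf)
		for _, row := range buf[:n] {
			fmt.Println(row) // values are only valid until the next ReadRows call
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
	}
}
```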
@@ -56,6 +78,7 @@ type Schema struct { // uuid | for string and [16]byte types, use the parquet UUID logical type // decimal | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type // date | for int32 types use the DATE logical type +// time | for int32 and int64 types use the TIME logical type // timestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision // split | for float32/float64, use the BYTE_STREAM_SPLIT encoding // id(n) | where n is int denoting a column field id. Example id(2) for a column with field id of 2 @@ -66,7 +89,18 @@ type Schema struct { // Supported precisions are: nanosecond, millisecond and microsecond. Example: // // type Message struct { -// TimestrampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"` +// TimestampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"` +// } +// +// Both the time and timestamp tags accept an optional second parameter +// to set the `isAdjustedToUTC` annotation of the parquet logical type. +// Valid values are "utc" or "local". If not specified, the default is +// "utc", which sets the `isAdjustedToUTC` annotation to true. Example: +// +// type Message struct { +// TimestampMicrosAdjusted int64 `parquet:"timestamp_micros_adjusted,timestamp(microsecond:utc)"` +// TimestampMicrosNotAdjusted int64 `parquet:"timestamp_micros_not_adjusted,timestamp(microsecond:local)"` // } // // The decimal tag must be followed by two integer parameters, the first integer @@ -123,15 +157,7 @@ func schemaOf(model reflect.Type) *Schema { // The function panics if Node contains more leaf columns than supported by the // package (see parquet.MaxColumnIndex). func NewSchema(name string, root Node) *Schema { - mapping, columns := columnMappingOf(root) - return &Schema{ - name: name, - root: root, - deconstruct: makeDeconstructFunc(root), - reconstruct: makeReconstructFunc(root), - mapping: mapping, - columns: columns, - } + return &Schema{name: name, root: root} } func dereference(t reflect.Type) reflect.Type { @@ -143,7 +169,7 @@ func dereference(t reflect.Type) reflect.Type { func makeDeconstructFunc(node Node) (deconstruct deconstructFunc) { if schema, _ := node.(*Schema); schema != nil { - return schema.deconstruct + return schema.lazyLoadFuncs().deconstruct } if !node.Leaf() { _, deconstruct = deconstructFuncOf(0, node) @@ -153,7 +179,7 @@ func makeDeconstructFunc(node Node) (deconstruct deconstructFunc) { func makeReconstructFunc(node Node) (reconstruct reconstructFunc) { if schema, _ := node.(*Schema); schema != nil { - return schema.lazyLoadFuncs().reconstruct + return schema.lazyLoadFuncs().reconstruct } if !node.Leaf() { _, reconstruct = reconstructFuncOf(0, node) @@ -161,6 +187,25 @@ func makeReconstructFunc(node Node) (reconstruct reconstructFunc) { return reconstruct } +func (s *Schema) lazyLoadFuncs() *schemaFuncs { + return s.funcs.load(func() *schemaFuncs { + return &schemaFuncs{ + deconstruct: makeDeconstructFunc(s.root), + reconstruct: makeReconstructFunc(s.root), + } + }) +} + +func (s *Schema) lazyLoadState() *schemaState { + return s.state.load(func() *schemaState { + mapping, columns := columnMappingOf(s.root) + return &schemaState{ + mapping: mapping, + columns: columns, + } + }) +} + // ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema // instances to be passed to row group constructors to pre-declare the schema of // the output parquet file.
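The new time tag and the utc/local argument documented above map directly onto struct tags. A small sketch; the type, field names, and column names are illustrative only:

```go
package main

import (
	"fmt"

	"github.com/parquet-go/parquet-go"
)

// Event is an illustrative type; the second tag argument sets the
// isAdjustedToUTC annotation and defaults to "utc" when omitted.
type Event struct {
	CreatedUTC   int64 `parquet:"created_utc,timestamp(microsecond)"`         // isAdjustedToUTC = true
	CreatedLocal int64 `parquet:"created_local,timestamp(microsecond:local)"` // isAdjustedToUTC = false
	TimeOfDayMs  int32 `parquet:"time_of_day_ms,time(millisecond)"`           // TIME, millisecond, utc
	TimeOfDayUs  int64 `parquet:"time_of_day_us,time(microsecond:local)"`     // TIME, microsecond, local
}

func main() {
	// SchemaOf derives the parquet schema, including the logical type
	// annotations selected by the tags above.
	fmt.Println(parquet.SchemaOf(Event{}))
}
```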
@@ -218,26 +263,25 @@ func (s *Schema) GoType() reflect.Type { return s.root.GoType() } // The method panics if the structure of the Go value does not match the // parquet schema. func (s *Schema) Deconstruct(row Row, value interface{}) Row { - columns := make([][]Value, len(s.columns)) - values := make([]Value, len(s.columns)) + state := s.lazyLoadState() + funcs := s.lazyLoadFuncs() + columns := make([][]Value, len(state.columns)) + values := make([]Value, len(state.columns)) for i := range columns { columns[i] = values[i : i : i+1] } - s.deconstructValueToColumns(columns, reflect.ValueOf(value)) - return appendRow(row, columns) -} - -func (s *Schema) deconstructValueToColumns(columns [][]Value, value reflect.Value) { - for value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface { - if value.IsNil() { - value = reflect.Value{} + v := reflect.ValueOf(value) + for v.Kind() == reflect.Ptr || v.Kind() == reflect.Interface { + if v.IsNil() { + v = reflect.Value{} break } - value = value.Elem() + v = v.Elem() } - s.deconstruct(columns, levels{}, value) + funcs.deconstruct(columns, levels{}, v) + return appendRow(row, columns) } // Reconstruct reconstructs a Go value from a row. @@ -267,7 +311,9 @@ func (s *Schema) Reconstruct(value interface{}, row Row) error { b := valuesSliceBufferPool.Get().(*valuesSliceBuffer) - columns := b.reserve(len(s.columns)) + state := s.lazyLoadState() + funcs := s.lazyLoadFuncs() + columns := b.reserve(len(state.columns)) row.Range(func(columnIndex int, columnValues []Value) bool { if columnIndex < len(columns) { columns[columnIndex] = columnValues @@ -275,7 +321,7 @@ func (s *Schema) Reconstruct(value interface{}, row Row) error { return true }) // we avoid the defer penalty by releasing b manually - err := s.reconstruct(v, levels{}, columns) + err := funcs.reconstruct(v, levels{}, columns) b.release() return err } @@ -320,7 +366,7 @@ var valuesSliceBufferPool = &sync.Pool{ // If the path was not found in the mapping, or if it did not represent a // leaf column of the parquet schema, the boolean will be false. func (s *Schema) Lookup(path ...string) (LeafColumn, bool) { - leaf := s.mapping.lookup(path) + leaf := s.lazyLoadState().mapping.lookup(path) return LeafColumn{ Node: leaf.node, Path: leaf.path, @@ -334,9 +380,7 @@ func (s *Schema) Lookup(path ...string) (LeafColumn, bool) { // // The method always returns the same slice value across calls to Columns; // applications should treat it as immutable. -func (s *Schema) Columns() [][]string { - return s.columns -} +func (s *Schema) Columns() [][]string { return s.lazyLoadState().columns } // Comparator constructs a comparator function which orders rows according to // the list of sorting columns passed as arguments.
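Deconstruct and Reconstruct now pull the deconstruct/reconstruct functions and the column mapping from the once-guarded loaders, so a Schema pays the construction cost exactly once, on first use. A round-trip sketch under that model; the Point type is a placeholder:

```go
package main

import (
	"fmt"
	"log"

	"github.com/parquet-go/parquet-go"
)

// Point is a placeholder row type for the round trip.
type Point struct {
	X int64 `parquet:"x"`
	Y int64 `parquet:"y"`
}

func main() {
	// Keep the schema as a *Schema: its lazily built internal state contains
	// synchronization primitives and must not be copied.
	schema := parquet.SchemaOf(Point{})

	// The first Deconstruct builds the deconstruct function and column
	// mapping on demand; later calls reuse them.
	row := schema.Deconstruct(nil, &Point{X: 1, Y: 2})

	var p Point
	if err := schema.Reconstruct(&p, row); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%+v\n", p) // {X:1 Y:2}
}
```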
@@ -678,19 +722,41 @@ func parseIDArgs(args string) (int, error) { return strconv.Atoi(args) } -func parseTimestampArgs(args string) (TimeUnit, error) { +func parseTimestampArgs(args string) (unit TimeUnit, isUTCNormalized bool, err error) { if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") { - return nil, fmt.Errorf("malformed timestamp args: %s", args) + return nil, false, fmt.Errorf("malformed timestamp args: %s", args) } args = strings.TrimPrefix(args, "(") args = strings.TrimSuffix(args, ")") if len(args) == 0 { - return Millisecond, nil + return Millisecond, true, nil + } + + parts := strings.Split(args, ":") + if len(parts) > 2 { + return nil, false, fmt.Errorf("malformed timestamp args: (%s)", args) } - switch args { + unit, err = parseTimeUnit(parts[0]) + if err != nil { + return nil, false, err + } + + adjusted := true + if len(parts) > 1 { + adjusted, err = parseUTCNormalization(parts[1]) + if err != nil { + return nil, false, err + } + } + + return unit, adjusted, nil +} + +func parseTimeUnit(arg string) (TimeUnit, error) { + switch arg { case "millisecond": return Millisecond, nil case "microsecond": @@ -700,7 +766,18 @@ func parseTimestampArgs(args string) (TimeUnit, error) { default: } - return nil, fmt.Errorf("unknown time unit: %s", args) + return nil, fmt.Errorf("unknown time unit: %s", arg) +} + +func parseUTCNormalization(arg string) (isUTCNormalized bool, err error) { + switch arg { + case "utc": + return true, nil + case "local": + return false, nil + default: + return false, fmt.Errorf("unknown utc normalization: %s", arg) + } } type goNode struct { @@ -887,22 +964,46 @@ func makeNodeOf(t reflect.Type, name string, tag []string) Node { default: throwInvalidTag(t, name, option) } + case "time": + switch t.Kind() { + case reflect.Int32: + timeUnit, adjusted, err := parseTimestampArgs(args) + if err != nil || timeUnit.Duration() < time.Millisecond { + throwInvalidTag(t, name, option+args) + } + setNode(TimeAdjusted(timeUnit, adjusted)) + case reflect.Int64: + timeUnit, adjusted, err := parseTimestampArgs(args) + if t == reflect.TypeOf(time.Duration(0)) { + if args == "()" { + timeUnit = Nanosecond + } else if timeUnit != Nanosecond { + throwInvalidTag(t, name, option+args) + } + } + if err != nil || timeUnit.Duration() == time.Millisecond { + throwInvalidTag(t, name, option+args) + } + setNode(TimeAdjusted(timeUnit, adjusted)) + default: + throwInvalidTag(t, name, option) + } case "timestamp": switch t.Kind() { case reflect.Int64: - timeUnit, err := parseTimestampArgs(args) + timeUnit, adjusted, err := parseTimestampArgs(args) if err != nil { - throwInvalidTag(t, name, option) + throwInvalidTag(t, name, option+args) } - setNode(Timestamp(timeUnit)) + setNode(TimestampAdjusted(timeUnit, adjusted)) default: switch t { case reflect.TypeOf(time.Time{}): - timeUnit, err := parseTimestampArgs(args) + timeUnit, adjusted, err := parseTimestampArgs(args) if err != nil { - throwInvalidTag(t, name, option) + throwInvalidTag(t, name, option+args) } - setNode(Timestamp(timeUnit)) + setNode(TimestampAdjusted(timeUnit, adjusted)) default: throwInvalidTag(t, name, option) } diff --git a/vendor/github.com/parquet-go/parquet-go/type.go b/vendor/github.com/parquet-go/parquet-go/type.go index f5690a22e2b2d..c425c0d187dbc 100644 --- a/vendor/github.com/parquet-go/parquet-go/type.go +++ b/vendor/github.com/parquet-go/parquet-go/type.go @@ -1803,10 +1803,19 @@ func (u *nanosecond) TimeUnit() format.TimeUnit { } // Time constructs a leaf node of TIME logical type. 
+// IsAdjustedToUTC is true by default. // // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time func Time(unit TimeUnit) Node { - return Leaf(&timeType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()}) + return TimeAdjusted(unit, true) +} + +// TimeAdjusted constructs a leaf node of TIME logical type +// with the IsAdjustedToUTC property explicitly set. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time +func TimeAdjusted(unit TimeUnit, isAdjustedToUTC bool) Node { + return Leaf(&timeType{IsAdjustedToUTC: isAdjustedToUTC, Unit: unit.TimeUnit()}) } type timeType format.TimeType @@ -1919,10 +1928,19 @@ func (t *timeType) ConvertValue(val Value, typ Type) (Value, error) { } // Timestamp constructs a leaf node of TIMESTAMP logical type. +// IsAdjustedToUTC is true by default. // // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp func Timestamp(unit TimeUnit) Node { - return Leaf(&timestampType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()}) + return TimestampAdjusted(unit, true) +} + +// TimestampAdjusted constructs a leaf node of TIMESTAMP logical type +// with the IsAdjustedToUTC property explicitly set. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp +func TimestampAdjusted(unit TimeUnit, isAdjustedToUTC bool) Node { + return Leaf(&timestampType{IsAdjustedToUTC: isAdjustedToUTC, Unit: unit.TimeUnit()}) } type timestampType format.TimestampType @@ -2256,6 +2274,94 @@ func (t *nullType) ConvertValue(val Value, _ Type) (Value, error) { return val, nil } +// Variant constructs a node of unshredded VARIANT logical type. It is a group with +// two required fields, "metadata" and "value", both byte arrays. +// +// Experimental: The specification for variants is still being developed and the type +// is not fully adopted. Support for this type is subject to change. +// +// Initial support does not attempt to process the variant data, so reading and writing +// data of this type behaves as if it were just a group with two byte array fields, as +// if the logical type annotation were absent. This may change in the future. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#variant +func Variant() Node { + return variantNode{Group{"metadata": Required(Leaf(ByteArrayType)), "value": Required(Leaf(ByteArrayType))}} +} + +// TODO: add ShreddedVariant(Node) function, to create a shredded variant +// where the argument defines the type/structure of the shredded value(s).
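TimeAdjusted, TimestampAdjusted, and the experimental Variant constructor introduced above can be combined when assembling a schema programmatically. A hedged sketch; the schema name and column names are illustrative, and Variant support is, per its own documentation, subject to change:

```go
package main

import (
	"fmt"

	"github.com/parquet-go/parquet-go"
)

func main() {
	// TimestampAdjusted and TimeAdjusted make the isAdjustedToUTC annotation
	// explicit; Variant declares the experimental unshredded VARIANT group
	// with required "metadata" and "value" byte array fields.
	schema := parquet.NewSchema("example", parquet.Group{
		"event_time":  parquet.TimestampAdjusted(parquet.Microsecond, false), // local semantics
		"time_of_day": parquet.TimeAdjusted(parquet.Millisecond, true),       // utc-normalized
		"payload":     parquet.Variant(),                                     // experimental, subject to change
	})
	fmt.Println(schema)
}
```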
+ +type variantNode struct{ Group } + +func (variantNode) Type() Type { return &variantType{} } + +type variantType format.VariantType + +func (t *variantType) String() string { return (*format.VariantType)(t).String() } + +func (t *variantType) Kind() Kind { panic("cannot call Kind on parquet VARIANT type") } + +func (t *variantType) Length() int { return 0 } + +func (t *variantType) EstimateSize(int) int { return 0 } + +func (t *variantType) EstimateNumValues(int) int { return 0 } + +func (t *variantType) Compare(Value, Value) int { + panic("cannot compare values on parquet VARIANT type") +} + +func (t *variantType) ColumnOrder() *format.ColumnOrder { return nil } + +func (t *variantType) PhysicalType() *format.Type { return nil } + +func (t *variantType) LogicalType() *format.LogicalType { + return &format.LogicalType{Variant: (*format.VariantType)(t)} +} + +func (t *variantType) ConvertedType() *deprecated.ConvertedType { return nil } + +func (t *variantType) NewColumnIndexer(int) ColumnIndexer { + panic("cannot create column indexer from parquet VARIANT type") +} + +func (t *variantType) NewDictionary(int, int, encoding.Values) Dictionary { + panic("cannot create dictionary from parquet VARIANT type") +} + +func (t *variantType) NewColumnBuffer(int, int) ColumnBuffer { + panic("cannot create column buffer from parquet VARIANT type") +} + +func (t *variantType) NewPage(int, int, encoding.Values) Page { + panic("cannot create page from parquet VARIANT type") +} + +func (t *variantType) NewValues(values []byte, _ []uint32) encoding.Values { + panic("cannot create values from parquet VARIANT type") +} + +func (t *variantType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { + panic("cannot encode parquet VARIANT type") +} + +func (t *variantType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { + panic("cannot decode parquet VARIANT type") +} + +func (t *variantType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { + panic("cannot estimate decode size of parquet VARIANT type") +} + +func (t *variantType) AssignValue(reflect.Value, Value) error { + panic("cannot assign value to a parquet VARIANT type") +} + +func (t *variantType) ConvertValue(Value, Type) (Value, error) { + panic("cannot convert value to a parquet VARIANT type") +} + type groupType struct{} func (groupType) String() string { return "group" } diff --git a/vendor/github.com/parquet-go/parquet-go/writer.go b/vendor/github.com/parquet-go/parquet-go/writer.go index 8d9e44c7ee2ba..88f7640c0ec45 100644 --- a/vendor/github.com/parquet-go/parquet-go/writer.go +++ b/vendor/github.com/parquet-go/parquet-go/writer.go @@ -3,6 +3,7 @@ package parquet import ( "bufio" "bytes" + "cmp" "encoding/binary" "fmt" "hash/crc32" @@ -12,7 +13,6 @@ import ( "os" "reflect" "slices" - "sort" "github.com/parquet-go/parquet-go/compress" "github.com/parquet-go/parquet-go/encoding" @@ -221,6 +221,10 @@ func (w *GenericWriter[T]) Schema() *Schema { return w.base.Schema() } +func (w *GenericWriter[T]) ColumnWriters() []ValueWriter { + return w.base.ColumnWriters() +} + func (w *GenericWriter[T]) writeRows(rows []T) (int, error) { if cap(w.base.rowbuf) < len(rows) { w.base.rowbuf = make([]Row, len(rows)) @@ -247,6 +251,12 @@ func (w *GenericWriter[T]) writeAny(rows []T) (n int, err error) { return n, nil } +// File returns a FileView of the written parquet file. +// Only available after Close is called.
+func (w *GenericWriter[T]) File() FileView { + return w.base.File() +} + var ( _ RowWriterWithSchema = (*GenericWriter[any])(nil) _ RowReaderFrom = (*GenericWriter[any])(nil) @@ -415,7 +425,7 @@ func (w *Writer) WriteRowGroup(rowGroup RowGroup) (int64, error) { return 0, ErrRowGroupSchemaMissing case w.schema == nil: w.configure(rowGroupSchema) - case !nodesAreEqual(w.schema, rowGroupSchema): + case !EqualNodes(w.schema, rowGroupSchema): return 0, ErrRowGroupSchemaMismatch } if err := w.writer.flush(); err != nil { @@ -478,6 +488,80 @@ func (w *Writer) SetKeyValueMetadata(key, value string) { }) } +// ColumnWriters returns writers for each column. This allows applications to +// write values directly to each column instead of having to first assemble +// values into rows to use WriteRows. +func (w *Writer) ColumnWriters() []ValueWriter { return w.writer.valueWriters } + +type writerFileView struct { + writer *writer + schema *Schema +} + +// File returns a FileView of the written parquet file. +// Only available after Close is called. +func (w *Writer) File() FileView { + if w.writer == nil || w.schema == nil { + return nil + } + return &writerFileView{ + w.writer, + w.schema, + } +} + +func (w *writerFileView) Metadata() *format.FileMetaData { + return w.writer.fileMetaData +} + +func (w *writerFileView) Schema() *Schema { + return w.schema +} + +func (w *writerFileView) NumRows() int64 { + if w.writer.fileMetaData != nil { + return w.writer.fileMetaData.NumRows + } + return 0 +} + +func (w *writerFileView) Lookup(key string) (string, bool) { + if w.writer.fileMetaData != nil { + return lookupKeyValueMetadata(w.writer.fileMetaData.KeyValueMetadata, key) + } + return "", false +} + +func (w *writerFileView) Size() int64 { + return w.writer.writer.offset +} + +func (w *writerFileView) ColumnIndexes() []format.ColumnIndex { + return w.writer.columnIndex +} + +func (w *writerFileView) OffsetIndexes() []format.OffsetIndex { + return w.writer.offsetIndex +} + +func (w *writerFileView) Root() *Column { + if w.writer.fileMetaData != nil { + root, _ := openColumns(nil, w.writer.fileMetaData, w.writer.columnIndex, w.writer.offsetIndex) + return root + } + return nil +} + +func (w *writerFileView) RowGroups() []RowGroup { + if w.writer.fileMetaData != nil { + columns := makeLeafColumns(w.Root()) + file := &File{metadata: *w.writer.fileMetaData, schema: w.schema} + fileRowGroups := makeFileRowGroups(file, columns) + return makeRowGroups(fileRowGroups) + } + return nil +} + type writer struct { buffer *bufio.Writer writer offsetTrackingWriter @@ -488,10 +572,11 @@ type writer struct { createdBy string metadata []format.KeyValue - columns []*writerColumn - columnChunk []format.ColumnChunk - columnIndex []format.ColumnIndex - offsetIndex []format.OffsetIndex + columns []*writerColumn + valueWriters []ValueWriter + columnChunk []format.ColumnChunk + columnIndex []format.ColumnIndex + offsetIndex []format.OffsetIndex columnOrders []format.ColumnOrder schemaElements []format.SchemaElement @@ -499,6 +584,8 @@ type writer struct { columnIndexes [][]format.ColumnIndex offsetIndexes [][]format.OffsetIndex sortingColumns []format.SortingColumn + + fileMetaData *format.FileMetaData } func newWriter(output io.Writer, config *WriterConfig) *writer { @@ -677,6 +764,10 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { for i, c := range w.columns { w.columnOrders[i] = *c.columnType.ColumnOrder() } + w.valueWriters = make([]ValueWriter, len(w.columns)) + for i, c := range w.columns { + 
w.valueWriters[i] = c + } return w } @@ -703,6 +794,7 @@ func (w *writer) reset(writer io.Writer) { w.rowGroups = w.rowGroups[:0] w.columnIndexes = w.columnIndexes[:0] w.offsetIndexes = w.offsetIndexes[:0] + w.fileMetaData = nil } func (w *writer) close() error { @@ -761,6 +853,7 @@ func (w *writer) writeFileFooter() error { protocol := new(thrift.CompactProtocol) encoder := thrift.NewEncoder(protocol.NewWriter(&w.writer)) + w.columnIndex = w.columnIndex[:0] for i, columnIndexes := range w.columnIndexes { rowGroup := &w.rowGroups[i] for j := range columnIndexes { @@ -771,8 +864,10 @@ func (w *writer) writeFileFooter() error { } column.ColumnIndexLength = int32(w.writer.offset - column.ColumnIndexOffset) } + w.columnIndex = append(w.columnIndex, columnIndexes...) } + w.offsetIndex = w.offsetIndex[:0] for i, offsetIndexes := range w.offsetIndexes { rowGroup := &w.rowGroups[i] for j := range offsetIndexes { @@ -783,6 +878,7 @@ func (w *writer) writeFileFooter() error { } column.OffsetIndexLength = int32(w.writer.offset - column.OffsetIndexOffset) } + w.offsetIndex = append(w.offsetIndex, offsetIndexes...) } numRows := int64(0) @@ -797,7 +893,7 @@ func (w *writer) writeFileFooter() error { // https://github.com/apache/arrow/blob/70b9ef5/go/parquet/metadata/file.go#L122-L127 const parquetFileFormatVersion = 2 - footer, err := thrift.Marshal(new(thrift.CompactProtocol), &format.FileMetaData{ + w.fileMetaData = &format.FileMetaData{ Version: parquetFileFormatVersion, Schema: w.schemaElements, NumRows: numRows, @@ -805,7 +901,8 @@ func (w *writer) writeFileFooter() error { KeyValueMetadata: w.metadata, CreatedBy: w.createdBy, ColumnOrders: w.columnOrders, - }) + } + footer, err := thrift.Marshal(new(thrift.CompactProtocol), w.fileMetaData) if err != nil { return err } @@ -853,15 +950,6 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] } fileOffset := w.writer.offset - for _, c := range w.columns { - if len(c.filter) > 0 { - c.columnChunk.MetaData.BloomFilterOffset = w.writer.offset - if err := c.writeBloomFilter(&w.writer); err != nil { - return 0, err - } - } - } - for i, c := range w.columns { w.columnIndex[i] = format.ColumnIndex(c.columnIndex.ColumnIndex()) @@ -888,6 +976,15 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] } } + for _, c := range w.columns { + if len(c.filter) > 0 { + c.columnChunk.MetaData.BloomFilterOffset = w.writer.offset + if err := c.writeBloomFilter(&w.writer); err != nil { + return 0, err + } + } + } + totalByteSize := int64(0) totalCompressedSize := int64(0) @@ -1669,19 +1766,15 @@ addPages: } func sortPageEncodings(encodings []format.Encoding) { - sort.Slice(encodings, func(i, j int) bool { - return encodings[i] < encodings[j] - }) + slices.Sort(encodings) } func sortPageEncodingStats(stats []format.PageEncodingStats) { - sort.Slice(stats, func(i, j int) bool { - s1 := &stats[i] - s2 := &stats[j] - if s1.PageType != s2.PageType { - return s1.PageType < s2.PageType + slices.SortFunc(stats, func(s1, s2 format.PageEncodingStats) int { + if k := cmp.Compare(s1.PageType, s2.PageType); k != 0 { + return k } - return s1.Encoding < s2.Encoding + return cmp.Compare(s1.Encoding, s2.Encoding) }) } diff --git a/vendor/modules.txt b/vendor/modules.txt index b9b2b4e366184..8f0f18ad5fd81 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1410,7 +1410,7 @@ github.com/oschwald/geoip2-golang # github.com/oschwald/maxminddb-golang v1.13.0 ## explicit; go 1.21 github.com/oschwald/maxminddb-golang -# 
github.com/parquet-go/parquet-go v0.24.0 +# github.com/parquet-go/parquet-go v0.25.0 ## explicit; go 1.22 github.com/parquet-go/parquet-go github.com/parquet-go/parquet-go/bloom
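On the writing side, the hunks above expose ColumnWriters for column-direct writes and a File accessor that returns a FileView backed by the footer metadata the writer now retains. A sketch of inspecting a written file through that view after Close, assuming an in-memory buffer and a placeholder metadata key:

```go
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/parquet-go/parquet-go"
)

// Record is a placeholder row type.
type Record struct {
	Name string `parquet:"name"`
}

func main() {
	var buf bytes.Buffer
	w := parquet.NewGenericWriter[Record](&buf)

	if _, err := w.Write([]Record{{Name: "a"}, {Name: "b"}}); err != nil {
		log.Fatal(err)
	}
	w.SetKeyValueMetadata("origin", "example") // placeholder key/value
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	// File returns nil before Close; afterwards it exposes the retained
	// footer, so the file can be inspected without re-opening the bytes.
	view := w.File()
	fmt.Println("rows:", view.NumRows())
	fmt.Println("size:", view.Size())
	if v, ok := view.Lookup("origin"); ok {
		fmt.Println("origin:", v)
	}
}
```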
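The footer hunk also replaces sort.Slice with slices.SortFunc and cmp.Compare. The two-key comparison idiom used by sortPageEncodingStats generalizes; a standalone sketch, where pageEncodingStats is a local stand-in rather than the package's format type:

```go
package main

import (
	"cmp"
	"fmt"
	"slices"
)

// pageEncodingStats is a stand-in struct with the same two sort keys.
type pageEncodingStats struct {
	PageType int32
	Encoding int32
}

func main() {
	stats := []pageEncodingStats{{2, 8}, {0, 4}, {2, 0}, {0, 3}}

	// Compare the primary key first and fall back to the secondary key on
	// ties, mirroring the ordering in sortPageEncodingStats above.
	slices.SortFunc(stats, func(a, b pageEncodingStats) int {
		if k := cmp.Compare(a.PageType, b.PageType); k != 0 {
			return k
		}
		return cmp.Compare(a.Encoding, b.Encoding)
	})
	fmt.Println(stats) // [{0 3} {0 4} {2 0} {2 8}]
}
```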