diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go index 48b0df9c8431a..230c985929d3b 100644 --- a/go/arrow/array/compare.go +++ b/go/arrow/array/compare.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/v13/arrow" "github.com/apache/arrow/go/v13/arrow/float16" + "github.com/apache/arrow/go/v13/internal/bitutils" ) // RecordEqual reports whether the two provided records are equal. @@ -735,13 +736,22 @@ func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) } func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool { - for i, lf := range left.fields { - rf := right.fields[i] - if !arrayApproxEqual(lf, rf, opt) { - return false + return bitutils.VisitSetBitRuns( + left.NullBitmapBytes(), + int64(left.Offset()), int64(left.Len()), + approxEqualStructRun(left, right, opt), + ) == nil +} + +func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn { + return func(pos int64, length int64) error { + for i := range left.fields { + if !sliceApproxEqual(left.fields[i], pos, pos+length, right.fields[i], pos, pos+length, opt) { + return arrow.ErrInvalid + } } + return nil } - return true } // arrayApproxEqualMap doesn't care about the order of keys (in Go map traversal order is undefined) diff --git a/go/arrow/array/struct.go b/go/arrow/array/struct.go index cb4fb5ec5e526..9ebc794006ad5 100644 --- a/go/arrow/array/struct.go +++ b/go/arrow/array/struct.go @@ -129,7 +129,7 @@ func (a *Struct) newStructFieldWithParentValidityMask(fieldIndex int) arrow.Arra maskedNullBitmapBytes := make([]byte, len(nullBitmapBytes)) copy(maskedNullBitmapBytes, nullBitmapBytes) for i := 0; i < field.Len(); i++ { - if !a.IsValid(i) { + if a.IsNull(i) { bitutil.ClearBit(maskedNullBitmapBytes, i) } } diff --git a/go/parquet/pqarrow/column_readers.go b/go/parquet/pqarrow/column_readers.go index df02b2c949e4e..767131f1a88b6 100644 --- a/go/parquet/pqarrow/column_readers.go +++ b/go/parquet/pqarrow/column_readers.go @@ -168,16 +168,6 @@ func (sr *structReader) Release() { } func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader, props ArrowReadProperties) *ColumnReader { - // there could be a mix of children some might be repeated and some might not be - // if possible use one that isn't since that will be guaranteed to have the least - // number of levels to reconstruct a nullable bitmap - var result *ColumnReader - for _, child := range children { - if !child.IsOrHasRepeatedChild() { - result = child - } - } - ret := &structReader{ rctx: rctx, filtered: filtered, @@ -186,10 +176,18 @@ func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.Leve props: props, refCount: 1, } - if result != nil { - ret.defRepLevelChild = result - ret.hasRepeatedChild = false - } else { + + // there could be a mix of children some might be repeated and some might not be + // if possible use one that isn't since that will be guaranteed to have the least + // number of levels to reconstruct a nullable bitmap + for _, child := range children { + if !child.IsOrHasRepeatedChild() { + ret.defRepLevelChild = child + break + } + } + + if ret.defRepLevelChild == nil { ret.defRepLevelChild = children[0] ret.hasRepeatedChild = true } @@ -250,15 +248,16 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) { var nullBitmap *memory.Buffer - if lenBound > 0 { + if lenBound > 0 && (sr.hasRepeatedChild || sr.filtered.Nullable) { + nullBitmap = memory.NewResizableBuffer(sr.rctx.mem) + nullBitmap.Resize(int(bitutil.BytesForBits(lenBound))) + validityIO.ValidBits = nullBitmap.Bytes() + defLevels, err := sr.GetDefLevels() + if err != nil { + return nil, err + } + if sr.hasRepeatedChild { - nullBitmap = memory.NewResizableBuffer(sr.rctx.mem) - nullBitmap.Resize(int(bitutil.BytesForBits(lenBound))) - validityIO.ValidBits = nullBitmap.Bytes() - defLevels, err := sr.GetDefLevels() - if err != nil { - return nil, err - } repLevels, err := sr.GetRepLevels() if err != nil { return nil, err @@ -267,16 +266,7 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) { if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil { return nil, err } - - } else if sr.filtered.Nullable { - nullBitmap = memory.NewResizableBuffer(sr.rctx.mem) - nullBitmap.Resize(int(bitutil.BytesForBits(lenBound))) - validityIO.ValidBits = nullBitmap.Bytes() - defLevels, err := sr.GetDefLevels() - if err != nil { - return nil, err - } - + } else { file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO) } } diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go index 877f584f20c8b..31f8a2b5abaa5 100644 --- a/go/parquet/pqarrow/encode_arrow_test.go +++ b/go/parquet/pqarrow/encode_arrow_test.go @@ -19,7 +19,6 @@ package pqarrow_test import ( "bytes" "context" - "errors" "fmt" "math" "strconv" @@ -32,7 +31,6 @@ import ( "github.com/apache/arrow/go/v13/arrow/decimal128" "github.com/apache/arrow/go/v13/arrow/ipc" "github.com/apache/arrow/go/v13/arrow/memory" - "github.com/apache/arrow/go/v13/internal/bitutils" "github.com/apache/arrow/go/v13/internal/types" "github.com/apache/arrow/go/v13/internal/utils" "github.com/apache/arrow/go/v13/parquet" @@ -931,6 +929,44 @@ func (ps *ParquetIOTestSuite) TestReadDecimals() { ps.True(array.Equal(expected, chunked.Chunk(0))) } +func (ps *ParquetIOTestSuite) TestReadNestedStruct() { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(ps.T(), 0) + + dt := arrow.StructOf(arrow.Field{ + Name: "nested", + Type: arrow.StructOf( + arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean}, + arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32}, + arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64}, + ), + }) + field := arrow.Field{Name: "struct", Type: dt, Nullable: true} + + builder := array.NewStructBuilder(mem, dt) + defer builder.Release() + nested := builder.FieldBuilder(0).(*array.StructBuilder) + + builder.Append(true) + nested.Append(true) + nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true) + nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1)) + nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2)) + builder.AppendNull() + + arr := builder.NewStructArray() + defer arr.Release() + + expected := array.NewTable( + arrow.NewSchema([]arrow.Field{field}, nil), + []arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, []arrow.Array{arr}))}, + -1, + ) + defer arr.Release() // NewChunked + defer expected.Release() + ps.roundTripTable(mem, expected, true) +} + func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte { var buf bytes.Buffer arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil) @@ -1032,29 +1068,9 @@ func (ps *ParquetIOTestSuite) roundTripTable(_ memory.Allocator, expected arrow. tblChunk := tbl.Column(0).Data() ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks())) - if exChunk.DataType().ID() != arrow.STRUCT { - exc := exChunk.Chunk(0) - tbc := tblChunk.Chunk(0) - ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc) - } else { - // current impl of ArrayEquals for structs doesn't correctly handle nulls in the parent - // with a non-nullable child when comparing. Since after the round trip, the data in the - // child will have the nulls, not the original data. - ex := exChunk.Chunk(0) - tb := tblChunk.Chunk(0) - ps.Equal(ex.NullN(), tb.NullN()) - if ex.NullN() > 0 { - ps.Equal(ex.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(ex.Len())))], tb.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(tb.Len())))]) - } - ps.Equal(ex.Len(), tb.Len()) - // only compare the non-null values - ps.NoErrorf(bitutils.VisitSetBitRuns(ex.NullBitmapBytes(), int64(ex.Data().Offset()), int64(ex.Len()), func(pos, length int64) error { - if !ps.True(array.SliceEqual(ex, pos, pos+length, tb, pos, pos+length)) { - return errors.New("failed") - } - return nil - }), "expected: %s\ngot: %s", ex, tb) - } + exc := exChunk.Chunk(0) + tbc := tblChunk.Chunk(0) + ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc) } func makeEmptyListsArray(size int) arrow.Array {