Skip to content

Commit

Permalink
GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqu…
Browse files Browse the repository at this point in the history
…al` (#35872)

### Rationale for this change

When comparing `array.Struct` values with `array.ApproxEqual` the validity bitmap of the struct itself should take precedence:
> When reading the struct array the parent validity bitmap takes priority.

This follows a brief discussion from #35851.

### What changes are included in this PR?

`array.arrayApproxEqualStruct` will check the fields data only if the struct elem is valid.

### Are these changes tested?

`pqarrow` tests are updated accordingly (no special treatment for structs, just `array.ApproxEqual`

### Are there any user-facing changes?

`array.ApproxEqual` behavior changed to match the docs about validity bitmap precedence.

* Closes: #35871

Authored-by: candiduslynx <candiduslynx@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
candiduslynx authored Jun 1, 2023
1 parent 3299d12 commit 61692b6
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 63 deletions.
20 changes: 15 additions & 5 deletions go/arrow/array/compare.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/float16"
"github.com/apache/arrow/go/v13/internal/bitutils"
)

// RecordEqual reports whether the two provided records are equal.
Expand Down Expand Up @@ -735,13 +736,22 @@ func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption)
}

func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool {
for i, lf := range left.fields {
rf := right.fields[i]
if !arrayApproxEqual(lf, rf, opt) {
return false
return bitutils.VisitSetBitRuns(
left.NullBitmapBytes(),
int64(left.Offset()), int64(left.Len()),
approxEqualStructRun(left, right, opt),
) == nil
}

func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn {
return func(pos int64, length int64) error {
for i := range left.fields {
if !sliceApproxEqual(left.fields[i], pos, pos+length, right.fields[i], pos, pos+length, opt) {
return arrow.ErrInvalid
}
}
return nil
}
return true
}

// arrayApproxEqualMap doesn't care about the order of keys (in Go map traversal order is undefined)
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/array/struct.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func (a *Struct) newStructFieldWithParentValidityMask(fieldIndex int) arrow.Arra
maskedNullBitmapBytes := make([]byte, len(nullBitmapBytes))
copy(maskedNullBitmapBytes, nullBitmapBytes)
for i := 0; i < field.Len(); i++ {
if !a.IsValid(i) {
if a.IsNull(i) {
bitutil.ClearBit(maskedNullBitmapBytes, i)
}
}
Expand Down
54 changes: 22 additions & 32 deletions go/parquet/pqarrow/column_readers.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,16 +168,6 @@ func (sr *structReader) Release() {
}

func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader, props ArrowReadProperties) *ColumnReader {
// there could be a mix of children some might be repeated and some might not be
// if possible use one that isn't since that will be guaranteed to have the least
// number of levels to reconstruct a nullable bitmap
var result *ColumnReader
for _, child := range children {
if !child.IsOrHasRepeatedChild() {
result = child
}
}

ret := &structReader{
rctx: rctx,
filtered: filtered,
Expand All @@ -186,10 +176,18 @@ func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.Leve
props: props,
refCount: 1,
}
if result != nil {
ret.defRepLevelChild = result
ret.hasRepeatedChild = false
} else {

// there could be a mix of children some might be repeated and some might not be
// if possible use one that isn't since that will be guaranteed to have the least
// number of levels to reconstruct a nullable bitmap
for _, child := range children {
if !child.IsOrHasRepeatedChild() {
ret.defRepLevelChild = child
break
}
}

if ret.defRepLevelChild == nil {
ret.defRepLevelChild = children[0]
ret.hasRepeatedChild = true
}
Expand Down Expand Up @@ -250,15 +248,16 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {

var nullBitmap *memory.Buffer

if lenBound > 0 {
if lenBound > 0 && (sr.hasRepeatedChild || sr.filtered.Nullable) {
nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
validityIO.ValidBits = nullBitmap.Bytes()
defLevels, err := sr.GetDefLevels()
if err != nil {
return nil, err
}

if sr.hasRepeatedChild {
nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
validityIO.ValidBits = nullBitmap.Bytes()
defLevels, err := sr.GetDefLevels()
if err != nil {
return nil, err
}
repLevels, err := sr.GetRepLevels()
if err != nil {
return nil, err
Expand All @@ -267,16 +266,7 @@ func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil {
return nil, err
}

} else if sr.filtered.Nullable {
nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
validityIO.ValidBits = nullBitmap.Bytes()
defLevels, err := sr.GetDefLevels()
if err != nil {
return nil, err
}

} else {
file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO)
}
}
Expand Down
66 changes: 41 additions & 25 deletions go/parquet/pqarrow/encode_arrow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package pqarrow_test
import (
"bytes"
"context"
"errors"
"fmt"
"math"
"strconv"
Expand All @@ -32,7 +31,6 @@ import (
"github.com/apache/arrow/go/v13/arrow/decimal128"
"github.com/apache/arrow/go/v13/arrow/ipc"
"github.com/apache/arrow/go/v13/arrow/memory"
"github.com/apache/arrow/go/v13/internal/bitutils"
"github.com/apache/arrow/go/v13/internal/types"
"github.com/apache/arrow/go/v13/internal/utils"
"github.com/apache/arrow/go/v13/parquet"
Expand Down Expand Up @@ -931,6 +929,44 @@ func (ps *ParquetIOTestSuite) TestReadDecimals() {
ps.True(array.Equal(expected, chunked.Chunk(0)))
}

func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
defer mem.AssertSize(ps.T(), 0)

dt := arrow.StructOf(arrow.Field{
Name: "nested",
Type: arrow.StructOf(
arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean},
arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32},
arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64},
),
})
field := arrow.Field{Name: "struct", Type: dt, Nullable: true}

builder := array.NewStructBuilder(mem, dt)
defer builder.Release()
nested := builder.FieldBuilder(0).(*array.StructBuilder)

builder.Append(true)
nested.Append(true)
nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true)
nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1))
nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2))
builder.AppendNull()

arr := builder.NewStructArray()
defer arr.Release()

expected := array.NewTable(
arrow.NewSchema([]arrow.Field{field}, nil),
[]arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, []arrow.Array{arr}))},
-1,
)
defer arr.Release() // NewChunked
defer expected.Release()
ps.roundTripTable(mem, expected, true)
}

func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
var buf bytes.Buffer
arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
Expand Down Expand Up @@ -1032,29 +1068,9 @@ func (ps *ParquetIOTestSuite) roundTripTable(_ memory.Allocator, expected arrow.
tblChunk := tbl.Column(0).Data()

ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
if exChunk.DataType().ID() != arrow.STRUCT {
exc := exChunk.Chunk(0)
tbc := tblChunk.Chunk(0)
ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
} else {
// current impl of ArrayEquals for structs doesn't correctly handle nulls in the parent
// with a non-nullable child when comparing. Since after the round trip, the data in the
// child will have the nulls, not the original data.
ex := exChunk.Chunk(0)
tb := tblChunk.Chunk(0)
ps.Equal(ex.NullN(), tb.NullN())
if ex.NullN() > 0 {
ps.Equal(ex.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(ex.Len())))], tb.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(tb.Len())))])
}
ps.Equal(ex.Len(), tb.Len())
// only compare the non-null values
ps.NoErrorf(bitutils.VisitSetBitRuns(ex.NullBitmapBytes(), int64(ex.Data().Offset()), int64(ex.Len()), func(pos, length int64) error {
if !ps.True(array.SliceEqual(ex, pos, pos+length, tb, pos, pos+length)) {
return errors.New("failed")
}
return nil
}), "expected: %s\ngot: %s", ex, tb)
}
exc := exChunk.Chunk(0)
tbc := tblChunk.Chunk(0)
ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc)
}

func makeEmptyListsArray(size int) arrow.Array {
Expand Down

0 comments on commit 61692b6

Please sign in to comment.