Skip to content

Commit

Permalink
feat(parquet): Support time32, time64, date32 & date64 (#164)
Browse files Browse the repository at this point in the history
  • Loading branch information
candiduslynx authored May 25, 2023
1 parent 93dba5d commit 6fff6fe
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 38 deletions.
37 changes: 37 additions & 0 deletions parquet/date.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package parquet

import (
"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/apache/arrow/go/v13/arrow/memory"
)

func reverseTransformDate32(arr *array.Timestamp, toTime toTimeFunc) arrow.Array {
builder := array.NewDate32Builder(memory.DefaultAllocator)

for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}

builder.Append(arrow.Date32FromTime(toTime(arr.Value(i))))
}

return builder.NewArray()
}

func reverseTransformDate64(arr *array.Timestamp, toTime toTimeFunc) arrow.Array {
builder := array.NewDate64Builder(memory.DefaultAllocator)

for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}

builder.Append(arrow.Date64FromTime(toTime(arr.Value(i))))
}

return builder.NewArray()
}
42 changes: 11 additions & 31 deletions parquet/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ func (*Client) Read(f ReaderAtSeeker, table *schema.Table, _ string, res chan<-
return fmt.Errorf("failed to get parquet record reader: %w", err)
}

arrowSchema := table.ToArrowSchema()
sc := table.ToArrowSchema()
for rr.Next() {
rec := rr.Record()
newRecs := convertToSingleRowRecords(arrowSchema, rec)
newRecs := convertToSingleRowRecords(sc, rec)
for _, r := range newRecs {
res <- r
}
Expand Down Expand Up @@ -69,12 +69,16 @@ func reverseTransformRecord(sc *arrow.Schema, rec arrow.Record) arrow.Record {
return array.NewRecord(sc, cols, -1)
}

func reverseTransformArray(dt arrow.DataType, col arrow.Array) arrow.Array {
switch arr := col.(type) {
func reverseTransformArray(dt arrow.DataType, arr arrow.Array) arrow.Array {
switch arr := arr.(type) {
case *array.String:
return reverseTransformFromString(dt, arr)
case *array.Timestamp:
return reverseTransformTimestamp(dt.(*arrow.TimestampType), arr)
return reverseTransformFromTimestamp(dt, arr)
case *array.Time32:
return reverseTransformTime32(dt.(*arrow.Time32Type), arr)
case *array.Time64:
return reverseTransformTime64(dt.(*arrow.Time64Type), arr)
case array.ListLike:
values := reverseTransformArray(dt.(listLikeType).Elem(), arr.ListValues())
res := array.NewListData(array.NewData(
Expand All @@ -87,10 +91,10 @@ func reverseTransformArray(dt arrow.DataType, col arrow.Array) arrow.Array {
}

if isUnsupportedType(dt) {
return reverseTransformFromString(dt, col)
return reverseTransformFromString(dt, arr)
}

return col
return arr
}

func reverseTransformFromString(dt arrow.DataType, col arrow.Array) arrow.Array {
Expand All @@ -106,27 +110,3 @@ func reverseTransformFromString(dt arrow.DataType, col arrow.Array) arrow.Array
}
return builder.NewArray()
}

func reverseTransformTimestamp(dtype *arrow.TimestampType, col *array.Timestamp) arrow.Array {
bldr := array.NewTimestampBuilder(memory.DefaultAllocator, dtype)
for i := 0; i < col.Len(); i++ {
if col.IsNull(i) {
bldr.AppendNull()
} else {
t := col.Value(i).ToTime(col.DataType().(*arrow.TimestampType).Unit)
switch dtype.Unit {
case arrow.Second:
bldr.Append(arrow.Timestamp(t.Unix()))
case arrow.Millisecond:
bldr.Append(arrow.Timestamp(t.UnixMilli()))
case arrow.Microsecond:
bldr.Append(arrow.Timestamp(t.UnixMicro()))
case arrow.Nanosecond:
bldr.Append(arrow.Timestamp(t.UnixNano()))
default:
panic(fmt.Errorf("unsupported timestamp unit: %s", dtype.Unit))
}
}
}
return bldr.NewTimestampArray()
}
91 changes: 91 additions & 0 deletions parquet/time.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package parquet

import (
"time"

"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/apache/arrow/go/v13/arrow/memory"
)

type toTimeFunc func(arrow.Timestamp) time.Time

func reverseTransformTime32(dt *arrow.Time32Type, arr *array.Time32) arrow.Array {
builder := array.NewTime32Builder(memory.DefaultAllocator, dt)

rescale := func() func(t arrow.Time32) arrow.Time32 {
switch arr.DataType().(*arrow.Time32Type).Unit {
case arrow.Second:
switch dt.Unit {
case arrow.Second:
return func(t arrow.Time32) arrow.Time32 { return t }
case arrow.Millisecond:
return func(t arrow.Time32) arrow.Time32 { return t * 1e3 }
default:
panic("unsupported time32 time unit: " + dt.Unit.String())
}
case arrow.Millisecond:
switch dt.Unit {
case arrow.Second:
return func(t arrow.Time32) arrow.Time32 { return t / 1e3 }
case arrow.Millisecond:
return func(t arrow.Time32) arrow.Time32 { return t }
default:
panic("unsupported time32 time unit: " + dt.Unit.String())
}
default:
panic("unsupported time32 time unit: " + arr.DataType().(*arrow.Time32Type).Unit.String())
}
}()

for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}

builder.Append(rescale(arr.Value(i)))
}

return builder.NewArray()
}

func reverseTransformTime64(dt *arrow.Time64Type, arr *array.Time64) arrow.Array {
builder := array.NewTime64Builder(memory.DefaultAllocator, dt)

rescale := func() func(t arrow.Time64) arrow.Time64 {
switch arr.DataType().(*arrow.Time64Type).Unit {
case arrow.Microsecond:
switch dt.Unit {
case arrow.Microsecond:
return func(t arrow.Time64) arrow.Time64 { return t }
case arrow.Nanosecond:
return func(t arrow.Time64) arrow.Time64 { return t * 1e3 }
default:
panic("unsupported time64 time unit: " + dt.Unit.String())
}
case arrow.Nanosecond:
switch dt.Unit {
case arrow.Microsecond:
return func(t arrow.Time64) arrow.Time64 { return t / 1e3 }
case arrow.Nanosecond:
return func(t arrow.Time64) arrow.Time64 { return t }
default:
panic("unsupported time64 time unit: " + dt.Unit.String())
}
default:
panic("unsupported time64 time unit: " + arr.DataType().(*arrow.Time64Type).Unit.String())
}
}()

for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}

builder.Append(rescale(arr.Value(i)))
}

return builder.NewArray()
}
43 changes: 43 additions & 0 deletions parquet/timestamp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package parquet

import (
"fmt"

"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/apache/arrow/go/v13/arrow/memory"
)

func reverseTransformTimestamp(dt *arrow.TimestampType, arr *array.Timestamp) arrow.Array {
builder := array.NewTimestampBuilder(memory.DefaultAllocator, dt)
in, out := arr.DataType().(*arrow.TimestampType).Unit, dt.Unit

for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}

builder.Append(arrow.Timestamp(arrow.ConvertTimestampValue(in, out, int64(arr.Value(i)))))
}

return builder.NewArray()
}

func reverseTransformFromTimestamp(dt arrow.DataType, arr *array.Timestamp) arrow.Array {
toTime, err := arr.DataType().(*arrow.TimestampType).GetToTimeFunc()
if err != nil {
panic(fmt.Errorf("failed GetToTimeFunc: %w", err))
}

switch dt := dt.(type) {
case *arrow.Date32Type:
return reverseTransformDate32(arr, toTime)
case *arrow.Date64Type:
return reverseTransformDate64(arr, toTime)
case *arrow.TimestampType:
return reverseTransformTimestamp(dt, arr)
default:
return reverseTransformFromString(dt, arr)
}
}
9 changes: 2 additions & 7 deletions parquet/write_read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,9 @@ import (
"github.com/cloudquery/plugin-sdk/v3/schema"
)

var pqTestOpts = schema.TestSourceOptions{
SkipTimes: true,
SkipDates: true,
}

func TestWriteRead(t *testing.T) {
var b bytes.Buffer
table := schema.TestTable("test", pqTestOpts)
table := schema.TestTable("test", schema.TestSourceOptions{})
sourceName := "test-source"
syncTime := time.Now().UTC().Round(time.Second)
opts := schema.GenTestDataOptions{
Expand Down Expand Up @@ -70,7 +65,7 @@ func TestWriteRead(t *testing.T) {
}

func BenchmarkWrite(b *testing.B) {
table := schema.TestTable("test", pqTestOpts)
table := schema.TestTable("test", schema.TestSourceOptions{})
sourceName := "test-source"
syncTime := time.Now().UTC().Round(time.Second)
opts := schema.GenTestDataOptions{
Expand Down

0 comments on commit 6fff6fe

Please sign in to comment.