From d1cbe70c58f9d2305b5c89ab3973897525d1d6ea Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Mon, 10 Apr 2023 17:03:04 +0200 Subject: [PATCH 1/5] Track the most likely next field in struct building --- Cargo.lock | 2 +- .../src/internal/generic_sinks/struct.rs | 41 +++++++++++++------ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 138ea527..0c9d4ba0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -973,7 +973,7 @@ dependencies = [ [[package]] name = "serde_arrow" -version = "0.6.0-rc.3" +version = "0.6.0" dependencies = [ "anyhow", "arrow-array 35.0.0", diff --git a/serde_arrow/src/internal/generic_sinks/struct.rs b/serde_arrow/src/internal/generic_sinks/struct.rs index 09c2ac90..ee215c26 100644 --- a/serde_arrow/src/internal/generic_sinks/struct.rs +++ b/serde_arrow/src/internal/generic_sinks/struct.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use crate::internal::{ error::{error, fail, Result}, event::Event, @@ -7,6 +9,7 @@ use crate::internal::{ pub struct StructArrayBuilder { pub(crate) field_meta: Vec, + pub(crate) field_lookup: BTreeMap, /// the builders of the sub arrays pub(crate) builders: Vec, /// the validity of the items @@ -19,8 +22,15 @@ pub struct StructArrayBuilder { impl StructArrayBuilder { pub fn new(field_meta: Vec, builders: Vec) -> Self { let num_columns = field_meta.len(); + + let mut field_lookup = BTreeMap::new(); + for (idx, meta) in field_meta.iter().enumerate() { + field_lookup.insert(meta.name.to_string(), idx); + } + Self { field_meta, + field_lookup, builders, validity: Vec::new(), state: StructArrayBuilderState::Start, @@ -46,7 +56,7 @@ impl StructArrayBuilder { #[derive(Debug, Clone, Copy)] pub enum StructArrayBuilderState { Start, - Field, + Field(usize), Value(usize, usize), } @@ -58,14 +68,14 @@ impl EventSink for StructArrayBuilder { Start => { if matches!(ev, Event::StartStruct | Event::StartMap) { this.seen = vec![false; this.field_meta.len()]; - Field + Field(0) } else { fail!( "Expected StartStruct, StartMap or marker in StructArrayBuilder, not {ev}" ) } } - Field => fail!("Unexpected event while waiting for field: {ev}"), + Field(_) => fail!("Unexpected event while waiting for field: {ev}"), Value(active, depth) => { next(&mut this.builders[active], val)?; Value(active, depth + 1) @@ -80,7 +90,7 @@ impl EventSink for StructArrayBuilder { Start => fail!( "Expected StartStruct, StartMap or marker in StructArrayBuilder, not {ev}" ), - Field => if matches!(ev, Event::EndStruct | Event::EndMap) { + Field(_) => if matches!(ev, Event::EndStruct | Event::EndMap) { for (idx, seen) in this.seen.iter().enumerate() { if !seen { if !this.field_meta[idx].nullable { @@ -98,7 +108,7 @@ impl EventSink for StructArrayBuilder { next(&mut this.builders[active], val)?; match depth { // the last closing event for the current value - 1 => Field, + 1 => Field(active + 1), // TODO: check is this event possible? 0 => fail!("Unbalanced opening / close events in StructArrayBuilder"), _ => Value(active, depth - 1), @@ -113,7 +123,7 @@ impl EventSink for StructArrayBuilder { this.state = match this.state { Start => Start, - Field => fail!("Unexpected event while waiting for field: {ev}"), + Field(_) => fail!("Unexpected event while waiting for field: {ev}"), Value(active, depth) => { next(&mut this.builders[active], val)?; Value(active, depth) @@ -146,18 +156,23 @@ impl EventSink for StructArrayBuilder { } Start } - Field => { + Field(best_guess) => { let key = match ev { Event::Str(key) => key, Event::OwnedStr(ref key) => key, ev => fail!("Unexpected event while waiting for field: {ev}"), }; - let idx = this - .field_meta - .iter() - .position(|m| m.name == key) - .ok_or_else(|| error!("unknown field {key}"))?; + let idx = if best_guess < this.field_meta.len() && this.field_meta[best_guess].name == key { + best_guess + } else { + this + .field_lookup + .get(key) + .copied() + .ok_or_else(|| error!("unknown field {key}"))? + }; + if this.seen[idx] { fail!("Duplicate field {}", this.field_meta[idx].name); } @@ -167,7 +182,7 @@ impl EventSink for StructArrayBuilder { Value(active, depth) => { next(&mut this.builders[active], val)?; if depth == 0 { - Field + Field(active + 1) } else { Value(active, depth) } From b1ec034304c58fbd0242cf132aea1b5f7623f3e3 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 11 Apr 2023 19:01:28 +0200 Subject: [PATCH 2/5] Add arrow=37 compat --- Cargo.lock | 168 ++++++++++++++++ serde_arrow/Cargo.toml | 22 +- serde_arrow/benches/arrow2.rs | 190 ++++++++++++++---- serde_arrow/src/arrow/schema.rs | 75 ++++--- serde_arrow/src/arrow/sinks.rs | 91 +++++---- serde_arrow/src/arrow/type_support.rs | 18 +- serde_arrow/src/arrow2/schema.rs | 10 +- serde_arrow/src/arrow2/sinks.rs | 53 ++--- .../src/internal/generic_sinks/list.rs | 8 +- serde_arrow/src/internal/generic_sinks/map.rs | 18 +- serde_arrow/src/internal/generic_sinks/mod.rs | 25 +-- .../src/internal/generic_sinks/struct.rs | 24 +-- .../src/internal/generic_sinks/tuple.rs | 8 +- .../src/internal/generic_sinks/union.rs | 10 +- serde_arrow/src/internal/schema.rs | 18 -- serde_arrow/src/lib.rs | 21 +- x.py | 43 ++-- 17 files changed, 539 insertions(+), 263 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0c9d4ba0..74171aba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,6 +74,22 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-array" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3190f208ee7aa0f3596fa0098d42911dec5e123ca88c002a08b24877ad14c71e" +dependencies = [ + "ahash", + "arrow-buffer 37.0.0", + "arrow-data 37.0.0", + "arrow-schema 37.0.0", + "chrono", + "half 2.2.1", + "hashbrown 0.13.2", + "num", +] + [[package]] name = "arrow-buffer" version = "35.0.0" @@ -94,6 +110,32 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d33c733c5b6c44a0fc526f29c09546e04eb56772a7a21e48e602f368be381f6" +dependencies = [ + "half 2.2.1", + "num", +] + +[[package]] +name = "arrow-cast" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abd349520b6a1ed4924ae2afc9d23330a3044319e4ec3d5b124c09e4d440ae87" +dependencies = [ + "arrow-array 37.0.0", + "arrow-buffer 37.0.0", + "arrow-data 37.0.0", + "arrow-schema 37.0.0", + "arrow-select", + "chrono", + "lexical-core", + "num", +] + [[package]] name = "arrow-data" version = "35.0.0" @@ -118,6 +160,18 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-data" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c8361947aaa96d331da9df3f7a08bdd8ab805a449994c97f5c4d24c4b7e2cf" +dependencies = [ + "arrow-buffer 37.0.0", + "arrow-schema 37.0.0", + "half 2.2.1", + "num", +] + [[package]] name = "arrow-format" version = "0.8.1" @@ -128,6 +182,26 @@ dependencies = [ "serde", ] +[[package]] +name = "arrow-json" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bf2366607be867ced681ad7f272371a5cf1fc2941328eef7b4fee14565166fb" +dependencies = [ + "arrow-array 37.0.0", + "arrow-buffer 37.0.0", + "arrow-cast", + "arrow-data 37.0.0", + "arrow-schema 37.0.0", + "chrono", + "half 2.2.1", + "indexmap", + "lexical-core", + "num", + "serde", + "serde_json", +] + [[package]] name = "arrow-schema" version = "35.0.0" @@ -140,6 +214,25 @@ version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d04f17f7b86ded0b5baf98fe6123391c4343e031acc3ccc5fa604cc180bff220" +[[package]] +name = "arrow-schema" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a16b88a93ac8350f0200b1cd336a1f887315925b8dd7aa145a37b8bdbd8497a4" + +[[package]] +name = "arrow-select" +version = "37.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98e8a4d6ca37d5212439b24caad4d80743fcbb706706200dd174bb98e68fe9d8" +dependencies = [ + "arrow-array 37.0.0", + "arrow-buffer 37.0.0", + "arrow-data 37.0.0", + "arrow-schema 37.0.0", + "num", +] + [[package]] name = "arrow2" version = "0.16.0" @@ -642,6 +735,70 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + [[package]] name = "libc" version = "0.2.140" @@ -978,12 +1135,17 @@ dependencies = [ "anyhow", "arrow-array 35.0.0", "arrow-array 36.0.0", + "arrow-array 37.0.0", "arrow-buffer 35.0.0", "arrow-buffer 36.0.0", + "arrow-buffer 37.0.0", "arrow-data 35.0.0", "arrow-data 36.0.0", + "arrow-data 37.0.0", + "arrow-json", "arrow-schema 35.0.0", "arrow-schema 36.0.0", + "arrow-schema 37.0.0", "arrow2 0.16.0", "arrow2 0.17.0", "chrono", @@ -1021,6 +1183,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "syn" version = "1.0.109" diff --git a/serde_arrow/Cargo.toml b/serde_arrow/Cargo.toml index b17f6852..341fa2ca 100644 --- a/serde_arrow/Cargo.toml +++ b/serde_arrow/Cargo.toml @@ -13,29 +13,35 @@ bench = false [[bench]] name = "arrow2" -required-features = ["arrow2-0-17", "arrow-36"] +required-features = ["arrow2-0-17", "arrow-37"] harness = false [package.metadata.docs.rs] -features = ["arrow2-0-17", "arrow-36"] +features = ["arrow2-0-17", "arrow-37"] [features] default = [] -arrow-35 = ["dep:arrow-array-35", "dep:arrow-schema-35", "dep:arrow-data-35", "dep:arrow-buffer-35"] +arrow-37 = ["dep:arrow-array-37", "dep:arrow-schema-37", "dep:arrow-data-37", "dep:arrow-buffer-37"] arrow-36 = ["dep:arrow-array-36", "dep:arrow-schema-36", "dep:arrow-data-36", "dep:arrow-buffer-36"] +arrow-35 = ["dep:arrow-array-35", "dep:arrow-schema-35", "dep:arrow-data-35", "dep:arrow-buffer-35"] [dependencies] -arrow-array-35 = { package = "arrow-array", version = "35", optional = true } -arrow-buffer-35 = { package = "arrow-buffer", version = "35", optional = true } -arrow-data-35 = { package = "arrow-data", version="35", optional = true } -arrow-schema-35 = { package = "arrow-schema", version = "35", optional = true } +arrow-array-37 = { package = "arrow-array", version = "37", optional = true } +arrow-buffer-37 = { package = "arrow-buffer", version = "37", optional = true } +arrow-data-37 = { package = "arrow-data", version="37", optional = true } +arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true } arrow-array-36 = { package = "arrow-array", version = "36", optional = true } arrow-buffer-36 = { package = "arrow-buffer", version = "36", optional = true } arrow-data-36 = { package = "arrow-data", version="36", optional = true } arrow-schema-36 = { package = "arrow-schema", version = "36", optional = true } +arrow-array-35 = { package = "arrow-array", version = "35", optional = true } +arrow-buffer-35 = { package = "arrow-buffer", version = "35", optional = true } +arrow-data-35 = { package = "arrow-data", version="35", optional = true } +arrow-schema-35 = { package = "arrow-schema", version = "35", optional = true } + arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true } arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true } @@ -51,5 +57,7 @@ serde = { version = "1", features = ["derive"] } serde_json = "1" rand = "0.8" +arrow-json-37 = { package = "arrow-json", version = "37" } + # for benchmarks criterion = "0.4" diff --git a/serde_arrow/benches/arrow2.rs b/serde_arrow/benches/arrow2.rs index 300dca74..48e776cc 100644 --- a/serde_arrow/benches/arrow2.rs +++ b/serde_arrow/benches/arrow2.rs @@ -1,5 +1,7 @@ -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use arrow_json_37::RawReaderBuilder; +use arrow_schema_37::Schema; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use serde_arrow::_impl::arrow2::{ array::{ @@ -16,31 +18,28 @@ use rand::{ Rng, }; use serde::{Deserialize, Serialize}; -use serde_arrow::arrow2::{deserialize_from_arrays, serialize_into_arrays, serialize_into_fields}; +use serde_arrow::{arrow, arrow2}; -fn benchmark_primitives(c: &mut Criterion) { - let mut group = c.benchmark_group("primitives"); - group.sample_size(20); - group.sampling_mode(criterion::SamplingMode::Flat); - group.measurement_time(Duration::from_secs(60)); +mod primitives { + use super::*; #[derive(Debug, Serialize, Deserialize)] - struct Item { - a: u8, - b: u16, - c: u32, - d: u64, - e: i8, - f: i16, - g: i32, - h: i64, - i: f32, - j: f64, - k: bool, + pub struct Item { + pub a: u8, + pub b: u16, + pub c: u32, + pub d: u64, + pub e: i8, + pub f: i16, + pub g: i32, + pub h: i64, + pub i: f32, + pub j: f64, + pub k: bool, } impl Item { - fn random(rng: &mut R) -> Self { + pub fn random(rng: &mut R) -> Self { Self { a: Standard.sample(rng), b: Standard.sample(rng), @@ -56,24 +55,26 @@ fn benchmark_primitives(c: &mut Criterion) { } } } +} + +fn benchmark_serialize_arrow2_primitives(c: &mut Criterion) { + let mut group = c.benchmark_group("serialize_arrow2_primitives"); + group.sample_size(20); + group.sampling_mode(criterion::SamplingMode::Flat); + group.measurement_time(Duration::from_secs(60)); let mut rng = rand::thread_rng(); let items = (0..100_000) - .map(|_| Item::random(&mut rng)) + .map(|_| primitives::Item::random(&mut rng)) .collect::>(); - let fields = serialize_into_fields(&items, Default::default()).unwrap(); - let arrays = serialize_into_arrays(&fields, &items).unwrap(); - - group.bench_function("serialize_into_arrays", |b| { - b.iter(|| black_box(serialize_into_arrays(&fields, &items).unwrap())); - }); + let fields = arrow2::serialize_into_fields(&items, Default::default()).unwrap(); - group.bench_function("deserialize_from_arrays", |b| { - b.iter(|| black_box::>(deserialize_from_arrays(&fields, &arrays).unwrap())); + group.bench_function("serde_arrow", |b| { + b.iter(|| black_box(arrow2::serialize_into_arrays(&fields, &items).unwrap())); }); - group.bench_function("manually_serialize", |b| { + group.bench_function("manual", |b| { b.iter(|| { let mut a = MutablePrimitiveArray::::new(); let mut b = MutablePrimitiveArray::::new(); @@ -118,7 +119,31 @@ fn benchmark_primitives(c: &mut Criterion) { }) }); - group.bench_function("manually_deserialize", |b| { + group.finish(); +} + + +fn benchmark_deserialize_arrow2_primitives(c: &mut Criterion) { + let mut group = c.benchmark_group("deserialize_arrow2_primitives"); + group.sample_size(20); + group.sampling_mode(criterion::SamplingMode::Flat); + group.measurement_time(Duration::from_secs(60)); + + let mut rng = rand::thread_rng(); + + let items = (0..100_000) + .map(|_| primitives::Item::random(&mut rng)) + .collect::>(); + let fields = arrow2::serialize_into_fields(&items, Default::default()).unwrap(); + let arrays = arrow2::serialize_into_arrays(&fields, &items).unwrap(); + + group.bench_function("serde_arrow", |b| { + b.iter(|| { + black_box::>(arrow2::deserialize_from_arrays(&fields, &arrays).unwrap()) + }); + }); + + group.bench_function("manual", |b| { b.iter(|| { let mut res = Vec::new(); @@ -165,7 +190,7 @@ fn benchmark_primitives(c: &mut Criterion) { let k = arrays[10].as_any().downcast_ref::().unwrap(); for ii in 0..a.len() { - res.push(Item { + res.push(primitives::Item { a: a.value(ii), b: b.value(ii), c: c.value(ii), @@ -187,8 +212,8 @@ fn benchmark_primitives(c: &mut Criterion) { group.finish(); } -fn benchmark_complex(c: &mut Criterion) { - let mut group = c.benchmark_group("complex"); +fn benchmark_serialize_arrow2_complex(c: &mut Criterion) { + let mut group = c.benchmark_group("serialize_arrow2_complex"); group.sample_size(20); group.sampling_mode(criterion::SamplingMode::Flat); group.measurement_time(Duration::from_secs(60)); @@ -233,18 +258,13 @@ fn benchmark_complex(c: &mut Criterion) { let items = (0..100_000) .map(|_| Item::random(&mut rng)) .collect::>(); - let fields = serialize_into_fields(&items, Default::default()).unwrap(); - let arrays = serialize_into_arrays(&fields, &items).unwrap(); + let fields = arrow2::serialize_into_fields(&items, Default::default()).unwrap(); - group.bench_function("serialize_into_arrays", |b| { - b.iter(|| black_box(serialize_into_arrays(&fields, &items).unwrap())); + group.bench_function("serde_arrow", |b| { + b.iter(|| black_box(arrow2::serialize_into_arrays(&fields, &items).unwrap())); }); - group.bench_function("deserialize_from_arrays", |b| { - b.iter(|| black_box::>(deserialize_from_arrays(&fields, &arrays).unwrap())); - }); - - group.bench_function("manually_serialize", |b| { + group.bench_function("manual", |b| { b.iter(|| { let mut string = MutableUtf8Array::::new(); let mut points_0 = MutablePrimitiveArray::::new(); @@ -315,5 +335,87 @@ fn benchmark_complex(c: &mut Criterion) { group.finish(); } -criterion_group!(benches, benchmark_primitives, benchmark_complex); +/// a simplified benchmark that is supported by arrow +fn benchmark_serialize_arrow_complex(c: &mut Criterion) { + let mut group = c.benchmark_group("serialize_arrow_complex"); + group.sample_size(20); + group.sampling_mode(criterion::SamplingMode::Flat); + group.measurement_time(Duration::from_secs(60)); + + #[derive(Debug, Serialize, Deserialize)] + struct Item { + string: String, + points: Vec, + child: SubItem, + } + + #[derive(Debug, Serialize, Deserialize)] + struct Point { + x: f32, + y: f32, + } + + #[derive(Debug, Serialize, Deserialize)] + struct SubItem { + a: bool, + b: f64, + c: Option, + } + + impl Item { + fn random(rng: &mut R) -> Self { + let n_string = Uniform::new(1, 50).sample(rng); + let n_points = Uniform::new(1, 50).sample(rng); + + Self { + string: (0..n_string) + .map(|_| -> char { Standard.sample(rng) }) + .collect(), + points: (0..n_points) + .map(|_| Point { + x: Standard.sample(rng), + y: Standard.sample(rng), + }) + .collect(), + child: SubItem { + a: Standard.sample(rng), + b: Standard.sample(rng), + c: Standard.sample(rng), + }, + } + } + } + + let mut rng = rand::thread_rng(); + + let items = (0..100_000) + .map(|_| Item::random(&mut rng)) + .collect::>(); + let fields = arrow::serialize_into_fields(&items, Default::default()).unwrap(); + + group.bench_function("arrow", |b| { + b.iter(|| { + let schema = Schema::new(fields.clone()); + let mut decoder = RawReaderBuilder::new(Arc::new(schema)) + .build_decoder() + .unwrap(); + decoder.serialize(&items).unwrap(); + black_box(decoder.flush().unwrap().unwrap()); + }); + }); + + group.bench_function("serde_arrow", |b| { + b.iter(|| black_box(arrow::serialize_into_arrays(&fields, &items).unwrap())); + }); + + group.finish(); +} + +criterion_group!( + benches, + benchmark_serialize_arrow2_primitives, + benchmark_deserialize_arrow2_primitives, + benchmark_serialize_arrow2_complex, + benchmark_serialize_arrow_complex, +); criterion_main!(benches); diff --git a/serde_arrow/src/arrow/schema.rs b/serde_arrow/src/arrow/schema.rs index 82ec4e72..662ad3bc 100644 --- a/serde_arrow/src/arrow/schema.rs +++ b/serde_arrow/src/arrow/schema.rs @@ -1,8 +1,9 @@ +use super::type_support::FieldRef; use crate::{ _impl::arrow::datatypes::{DataType, Field, UnionMode}, internal::{ error::{error, fail, Error, Result}, - schema::{FieldMeta, GenericDataType, GenericField, Strategy, STRATEGY_KEY}, + schema::{GenericDataType, GenericField, Strategy, STRATEGY_KEY}, }, }; @@ -55,7 +56,7 @@ impl TryFrom<&Field> for GenericField { } DataType::Struct(fields) => { for field in fields { - children.push(field.try_into()?); + children.push(field.as_field_ref().try_into()?); } GenericDataType::Struct } @@ -63,6 +64,21 @@ impl TryFrom<&Field> for GenericField { children.push(field.as_ref().try_into()?); GenericDataType::Map } + #[cfg(feature = "arrow-37")] + DataType::Union(fields, mode) => { + if !matches!(mode, UnionMode::Dense) { + fail!("Only dense unions are supported at the moment"); + } + + for (pos, (idx, field)) in fields.iter().enumerate() { + if pos as i8 != idx { + fail!("Union types with explicit field indices are not supported"); + } + children.push(field.as_ref().try_into()?); + } + GenericDataType::Union + } + #[cfg(not(feature = "arrow-37"))] DataType::Union(fields, field_indices, mode) => { if field_indices .iter() @@ -123,26 +139,32 @@ impl TryFrom<&GenericField> for Field { GenericDataType::Date64 => DataType::Date64, GenericDataType::Utf8 => DataType::Utf8, GenericDataType::LargeUtf8 => DataType::LargeUtf8, - GenericDataType::List => DataType::List(Box::new( - value - .children - .get(0) - .ok_or_else(|| error!("List must a single child"))? - .try_into()?, - )), - GenericDataType::LargeList => DataType::LargeList(Box::new( - value - .children - .get(0) - .ok_or_else(|| error!("List must a single child"))? - .try_into()?, - )), + GenericDataType::List => DataType::List( + Box::::new( + value + .children + .get(0) + .ok_or_else(|| error!("List must a single child"))? + .try_into()?, + ) + .into(), + ), + GenericDataType::LargeList => DataType::LargeList( + Box::::new( + value + .children + .get(0) + .ok_or_else(|| error!("List must a single child"))? + .try_into()?, + ) + .into(), + ), GenericDataType::Struct => DataType::Struct( value .children .iter() .map(Field::try_from) - .collect::>>()?, + .collect::>()?, ), GenericDataType::Map => { let element_field: Field = value @@ -150,8 +172,17 @@ impl TryFrom<&GenericField> for Field { .get(0) .ok_or_else(|| error!("Map must a single child"))? .try_into()?; - DataType::Map(Box::new(element_field), false) + DataType::Map(Box::new(element_field).into(), false) } + #[cfg(feature = "arrow-37")] + GenericDataType::Union => { + let mut fields = Vec::new(); + for (idx, field) in value.children.iter().enumerate() { + fields.push((idx as i8, std::sync::Arc::new(Field::try_from(field)?))); + } + DataType::Union(fields.into_iter().collect(), UnionMode::Dense) + } + #[cfg(not(feature = "arrow-37"))] GenericDataType::Union => DataType::Union( value .children @@ -199,11 +230,3 @@ impl TryFrom<&GenericField> for Field { Ok(field) } } - -impl FieldMeta { - pub fn to_arrow(&self, data_type: &DataType) -> Field { - let mut field = Field::new(self.name.to_string(), data_type.clone(), self.nullable); - field.set_metadata(self.strategy.clone().map(|s| s.into()).unwrap_or_default()); - field - } -} diff --git a/serde_arrow/src/arrow/sinks.rs b/serde_arrow/src/arrow/sinks.rs index bf44e0f8..e3d2ea3b 100644 --- a/serde_arrow/src/arrow/sinks.rs +++ b/serde_arrow/src/arrow/sinks.rs @@ -2,13 +2,12 @@ use crate::{ _impl::arrow::{ array::{ self, Array, ArrayData, ArrowPrimitiveType, BooleanBufferBuilder, BooleanBuilder, - GenericListArray, GenericStringBuilder, NullArray, OffsetSizeTrait, PrimitiveBuilder, - StructArray, + GenericStringBuilder, NullArray, OffsetSizeTrait, PrimitiveBuilder, StructArray, }, buffer::Buffer, datatypes::{ - DataType, Date64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, - Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, UnionMode, + DataType, Date64Type, Field, Float16Type, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }, }, internal::{ @@ -18,11 +17,13 @@ use crate::{ DictionaryUtf8ArrayBuilder, ListArrayBuilder, MapArrayBuilder, NullArrayBuilder, PrimitiveBuilders, StructArrayBuilder, TupleStructBuilder, UnionArrayBuilder, }, - schema::FieldMeta, + schema::GenericField, sink::{macros, ArrayBuilder, DynamicArrayBuilder, EventSink}, }, }; +use super::type_support::FieldRef; + pub struct ArrowPrimitiveBuilders; impl PrimitiveBuilders for ArrowPrimitiveBuilders { @@ -390,7 +391,7 @@ impl ArrayBuilder for Utf8ArrayBuilder { } fn build_struct_array( - field_meta: &[FieldMeta], + field: &GenericField, builders: &mut [B], validity: &mut Vec, ) -> Result @@ -401,16 +402,12 @@ where let len = validity.len(); let validity = build_null_bit_buffer(validity); - let mut fields = Vec::new(); let mut data = Vec::new(); - - for (i, builder) in builders.iter_mut().enumerate() { - let arr = builder.build_array()?; - fields.push(field_meta[i].to_arrow(arr.data_type())); - data.push(arr); + for builder in builders { + data.push(builder.build_array()?); } - Ok(ArrayData::builder(DataType::Struct(fields)) + Ok(ArrayData::builder(field_to_datatype(field)?) .len(len) .null_bit_buffer(Some(validity)) .child_data(data) @@ -423,7 +420,7 @@ impl> ArrayBuilder for StructArrayBuilder< fail!("Cannot build array from unfinished StructArrayBuilder"); } - build_struct_array(&self.field_meta, &mut self.builders, &mut self.validity) + build_struct_array(&self.field, &mut self.builders, &mut self.validity) } } @@ -433,7 +430,7 @@ impl> ArrayBuilder for TupleStructBuilder< fail!("Cannot build array from unfinished TupleStructBuilder"); } - build_struct_array(&self.field_meta, &mut self.builders, &mut self.validity) + build_struct_array(&self.field, &mut self.builders, &mut self.validity) } } @@ -454,23 +451,10 @@ impl> ArrayBuilder for UnionArrayBuilder, O: OffsetSizeTrait>( let null_bit_buffer = build_null_bit_buffer(validity); let offset_buffer = Buffer::from_vec(offsets); - let field = Box::new(this.field_meta.to_arrow(values.data_type())); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(field); - let array_data_builder = ArrayData::builder(data_type) + let array_data_builder = ArrayData::builder(field_to_datatype(&this.field)?) .len(len) .add_buffer(offset_buffer) .add_child_data(values) @@ -546,20 +528,32 @@ impl> ArrayBuilder for MapArrayBuilder let validity = std::mem::take(&mut self.validity); let validity = build_null_bit_buffer(validity); - let inner = StructArray::from(vec![ - ( - self.key_meta.to_arrow(keys.data_type()), - array::make_array(keys), - ), - ( - self.val_meta.to_arrow(values.data_type()), - array::make_array(values), - ), - ]); + let keys = array::make_array(keys); + let values = array::make_array(values); - let data_type = DataType::Map(Box::new(self.field_meta.to_arrow(inner.data_type())), false); + let dtype = field_to_datatype(&self.field)?; - let res = ArrayData::builder(data_type) + let inner_field = match &dtype { + DataType::Map(inner, _) => inner.as_field_ref(), + _ => fail!("Invalid datatype during map construction"), + }; + + let (key_field, val_field) = match inner_field.data_type() { + DataType::Struct(entries) => { + if entries.len() != 2 { + fail!("Invalid number of fields in map dtype") + } + ( + entries[0].as_field_ref().clone(), + entries[1].as_field_ref().clone(), + ) + } + _ => fail!("Invalid datatype during map construction"), + }; + + let inner = StructArray::from(vec![(key_field, keys), (val_field, values)]); + + let res = ArrayData::builder(dtype) .len(len) .add_buffer(offsets) .add_child_data(inner.into_data()) @@ -587,3 +581,8 @@ impl> ArrayBuilder for DictionaryUtf8Array Ok(res) } } + +fn field_to_datatype(field: &GenericField) -> Result { + let field: Field = field.try_into()?; + Ok(field.data_type().clone()) +} diff --git a/serde_arrow/src/arrow/type_support.rs b/serde_arrow/src/arrow/type_support.rs index b404d5ca..dce9f2c6 100644 --- a/serde_arrow/src/arrow/type_support.rs +++ b/serde_arrow/src/arrow/type_support.rs @@ -1,4 +1,4 @@ -use crate::_impl::arrow::error::ArrowError; +use crate::_impl::arrow::{datatypes::Field, error::ArrowError}; use crate::internal::error::Error; @@ -7,3 +7,19 @@ impl From for Error { Self::custom(err.to_string()) } } + +pub trait FieldRef { + fn as_field_ref(&self) -> &Field; +} + +impl FieldRef for Field { + fn as_field_ref(&self) -> &Field { + self + } +} + +impl FieldRef for std::sync::Arc { + fn as_field_ref(&self) -> &Field { + self.as_ref() + } +} diff --git a/serde_arrow/src/arrow2/schema.rs b/serde_arrow/src/arrow2/schema.rs index 48965813..2c6b6e64 100644 --- a/serde_arrow/src/arrow2/schema.rs +++ b/serde_arrow/src/arrow2/schema.rs @@ -3,7 +3,7 @@ use crate::{ arrow2::display, internal::{ error::{error, fail, Error, Result}, - schema::{FieldMeta, GenericDataType, GenericField, Strategy, STRATEGY_KEY}, + schema::{GenericDataType, GenericField, Strategy, STRATEGY_KEY}, }, }; @@ -315,11 +315,3 @@ impl TryFrom<&GenericField> for Field { Ok(field) } } - -impl FieldMeta { - pub fn to_arrow2(&self, data_type: &DataType) -> Field { - let mut field = Field::new(self.name.to_string(), data_type.clone(), self.nullable); - field.metadata = self.strategy.clone().map(|s| s.into()).unwrap_or_default(); - field - } -} diff --git a/serde_arrow/src/arrow2/sinks.rs b/serde_arrow/src/arrow2/sinks.rs index d21adb92..536b408a 100644 --- a/serde_arrow/src/arrow2/sinks.rs +++ b/serde_arrow/src/arrow2/sinks.rs @@ -7,7 +7,7 @@ use crate::{ UnionArray, Utf8Array, }, bitmap::Bitmap, - datatypes::{DataType, IntegerType, UnionMode}, + datatypes::{DataType, Field, IntegerType}, offset::OffsetsBuffer, types::{f16, Offset}, }, @@ -19,6 +19,7 @@ use crate::{ TupleStructBuilder, UnionArrayBuilder, }, generic_sinks::{NullArrayBuilder, PrimitiveBuilders}, + schema::GenericField, sink::{macros, ArrayBuilder, DynamicArrayBuilder, EventSink}, }, }; @@ -108,14 +109,8 @@ impl>> ArrayBuilder> for StructArr self.builders.iter_mut().map(|b| b.build_array()).collect(); let values = values?; - let mut fields = Vec::new(); - for (i, value) in values.iter().enumerate() { - fields.push(self.field_meta[i].to_arrow2(value.data_type())); - } - let data_type = DataType::Struct(fields); - Ok(Box::new(StructArray::new( - data_type, + field_to_datatype(&self.field)?, values, Some(std::mem::take(&mut self.validity).into()), ))) @@ -132,14 +127,8 @@ impl>> ArrayBuilder> for TupleStru self.builders.iter_mut().map(|b| b.build_array()).collect(); let values = values?; - let mut fields = Vec::new(); - for (i, value) in values.iter().enumerate() { - fields.push(self.field_meta[i].to_arrow2(value.data_type())); - } - let data_type = DataType::Struct(fields); - Ok(Box::new(StructArray::new( - data_type, + field_to_datatype(&self.field)?, values, Some(std::mem::take(&mut self.validity).into()), ))) @@ -159,14 +148,8 @@ impl>> ArrayBuilder> for UnionArra .collect(); let values = values?; - let mut fields = Vec::new(); - for (i, value) in values.iter().enumerate() { - fields.push(self.field_meta[i].to_arrow2(value.data_type())); - } - let data_type = DataType::Union(fields, None, UnionMode::Dense); - Ok(Box::new(UnionArray::new( - data_type, + field_to_datatype(&self.field)?, self.field_types.clone().into(), values, Some(std::mem::take(&mut self.field_offsets).into()), @@ -182,7 +165,7 @@ impl>> ArrayBuilder> for ListArray let values = self.builder.build_array()?; let array = ListArray::try_new( - DataType::List(Box::new(self.field_meta.to_arrow2(values.data_type()))), + field_to_datatype(&self.field)?, OffsetsBuffer::try_from(std::mem::take(&mut self.offsets))?, values, Some(Bitmap::from(std::mem::take(&mut self.validity))), @@ -199,7 +182,7 @@ impl>> ArrayBuilder> for ListArray let values = self.builder.build_array()?; let array = ListArray::try_new( - DataType::LargeList(Box::new(self.field_meta.to_arrow2(values.data_type()))), + field_to_datatype(&self.field)?, OffsetsBuffer::try_from(std::mem::take(&mut self.offsets))?, values, Some(Bitmap::from(std::mem::take(&mut self.validity))), @@ -218,19 +201,18 @@ impl>> ArrayBuilder> for MapArrayB let keys = self.key_builder.build_array()?; let vals = self.val_builder.build_array()?; - // TODO: fix nullability of different fields - let entries_type = DataType::Struct(vec![ - self.key_meta.to_arrow2(keys.data_type()), - self.val_meta.to_arrow2(vals.data_type()), - ]); + let dtype = field_to_datatype(&self.field)?; - let entries = StructArray::try_new(entries_type.clone(), vec![keys, vals], None)?; - let entries: Box = Box::new(entries); + let entries_field = match dtype { + DataType::Map(inner, _) => inner.as_ref().clone(), + _ => fail!("Invalid data type during struct construction"), + }; - let map_type = DataType::Map(Box::new(self.field_meta.to_arrow2(&entries_type)), false); + let entries = StructArray::try_new(entries_field.data_type, vec![keys, vals], None)?; + let entries: Box = Box::new(entries); let array = MapArray::try_new( - map_type, + field_to_datatype(&self.field)?, OffsetsBuffer::try_from(std::mem::take(&mut self.offsets))?, entries, Some(std::mem::take(&mut self.validity).into()), @@ -541,3 +523,8 @@ where } } } + +fn field_to_datatype(field: &GenericField) -> Result { + let field: Field = field.try_into()?; + Ok(field.data_type) +} diff --git a/serde_arrow/src/internal/generic_sinks/list.rs b/serde_arrow/src/internal/generic_sinks/list.rs index f419ac50..6d5177a6 100644 --- a/serde_arrow/src/internal/generic_sinks/list.rs +++ b/serde_arrow/src/internal/generic_sinks/list.rs @@ -1,12 +1,12 @@ use crate::internal::{ error::{fail, Error, Result}, event::Event, - schema::FieldMeta, + schema::GenericField, sink::{macros, EventSink}, }; pub struct ListArrayBuilder { - pub field_meta: FieldMeta, + pub field: GenericField, pub builder: B, next: ListBuilderState, pub offsets: Vec, @@ -15,9 +15,9 @@ pub struct ListArrayBuilder { } impl ListArrayBuilder { - pub fn new(field_meta: FieldMeta, builder: B) -> Self { + pub fn new(field: GenericField, builder: B) -> Self { Self { - field_meta, + field, builder, next: ListBuilderState::Start { offset: 0 }, offsets: vec![Default::default()], diff --git a/serde_arrow/src/internal/generic_sinks/map.rs b/serde_arrow/src/internal/generic_sinks/map.rs index 5a653010..652f1883 100644 --- a/serde_arrow/src/internal/generic_sinks/map.rs +++ b/serde_arrow/src/internal/generic_sinks/map.rs @@ -1,16 +1,14 @@ use crate::internal::{ error::{fail, Result}, event::Event, - schema::FieldMeta, + schema::GenericField, sink::{macros, EventSink}, }; pub struct MapArrayBuilder { next: MapBuilderState, - pub field_meta: FieldMeta, - pub key_meta: FieldMeta, + pub field: GenericField, pub key_builder: B, - pub val_meta: FieldMeta, pub val_builder: B, pub offsets: Vec, pub offset: i32, @@ -26,18 +24,10 @@ enum MapBuilderState { } impl MapArrayBuilder { - pub fn new( - field_meta: FieldMeta, - key_meta: FieldMeta, - key_builder: B, - val_meta: FieldMeta, - val_builder: B, - ) -> Self { + pub fn new(field: GenericField, key_builder: B, val_builder: B) -> Self { Self { - field_meta, - key_meta, + field, key_builder, - val_meta, val_builder, next: MapBuilderState::Start, offsets: vec![0], diff --git a/serde_arrow/src/internal/generic_sinks/mod.rs b/serde_arrow/src/internal/generic_sinks/mod.rs index 23426476..61e98097 100644 --- a/serde_arrow/src/internal/generic_sinks/mod.rs +++ b/serde_arrow/src/internal/generic_sinks/mod.rs @@ -60,15 +60,15 @@ where ListArrayBuilder, i32>: ArrayBuilder, ListArrayBuilder, i64>: ArrayBuilder, { - let mut field_meta = Vec::new(); let mut builders = Vec::new(); - for field in fields { - field_meta.push(field.into()); builders.push(build_array_builder::(field)?); } - Ok(StructArrayBuilder::new(field_meta, builders)) + let mut field = GenericField::new("dummy", GenericDataType::Struct, true); + field.children = fields.to_vec(); + + Ok(StructArrayBuilder::new(field, builders)) } pub fn build_array_builder( @@ -120,20 +120,18 @@ where .iter() .map(build_array_builder::) .collect::>>()?; - let field_meta = field.children.iter().map(|f| f.into()).collect(); - let builder = TupleStructBuilder::new(field_meta, builders); + let builder = TupleStructBuilder::new(field.clone(), builders); Ok(DynamicArrayBuilder::new(builder)) } None | Some(Strategy::MapAsStruct) => { - let field_meta = field.children.iter().map(|f| f.into()).collect(); let builders = field .children .iter() .map(build_array_builder::) .collect::>>()?; - let builder = StructArrayBuilder::new(field_meta, builders); + let builder = StructArrayBuilder::new(field.clone(), builders); Ok(DynamicArrayBuilder::new(builder)) } Some(strategy) => fail!("Invalid strategy {strategy} for type Struct"), @@ -144,9 +142,8 @@ where .iter() .map(build_array_builder::) .collect::>>()?; - let meta = field.children.iter().map(|f| f.into()).collect(); - let builder = UnionArrayBuilder::new(meta, builders, field.nullable); + let builder = UnionArrayBuilder::new(field.clone(), builders); Ok(DynamicArrayBuilder::new(builder)) } Dictionary => { @@ -185,10 +182,8 @@ where .ok_or_else(|| error!("Dictionary entries must have key, value children"))?; let builder = MapArrayBuilder::new( - entries.into(), - key.into(), + field.clone(), build_array_builder::(key)?, - value.into(), build_array_builder::(value)?, ); Ok(DynamicArrayBuilder::new(builder)) @@ -202,12 +197,12 @@ where if let List = ty { Ok(DynamicArrayBuilder::new(ListArrayBuilder::<_, i32>::new( - child.into(), + field.clone(), values, ))) } else { Ok(DynamicArrayBuilder::new(ListArrayBuilder::<_, i64>::new( - child.into(), + field.clone(), values, ))) } diff --git a/serde_arrow/src/internal/generic_sinks/struct.rs b/serde_arrow/src/internal/generic_sinks/struct.rs index ee215c26..16410c5a 100644 --- a/serde_arrow/src/internal/generic_sinks/struct.rs +++ b/serde_arrow/src/internal/generic_sinks/struct.rs @@ -3,12 +3,12 @@ use std::collections::BTreeMap; use crate::internal::{ error::{error, fail, Result}, event::Event, - schema::FieldMeta, + schema::GenericField, sink::{macros, ArrayBuilder, EventSink}, }; pub struct StructArrayBuilder { - pub(crate) field_meta: Vec, + pub(crate) field: GenericField, pub(crate) field_lookup: BTreeMap, /// the builders of the sub arrays pub(crate) builders: Vec, @@ -20,16 +20,16 @@ pub struct StructArrayBuilder { } impl StructArrayBuilder { - pub fn new(field_meta: Vec, builders: Vec) -> Self { - let num_columns = field_meta.len(); + pub fn new(field: GenericField, builders: Vec) -> Self { + let num_columns = field.children.len(); let mut field_lookup = BTreeMap::new(); - for (idx, meta) in field_meta.iter().enumerate() { - field_lookup.insert(meta.name.to_string(), idx); + for (idx, child) in field.children.iter().enumerate() { + field_lookup.insert(child.name.to_string(), idx); } Self { - field_meta, + field, field_lookup, builders, validity: Vec::new(), @@ -67,7 +67,7 @@ impl EventSink for StructArrayBuilder { this.state = match this.state { Start => { if matches!(ev, Event::StartStruct | Event::StartMap) { - this.seen = vec![false; this.field_meta.len()]; + this.seen = vec![false; this.field_lookup.len()]; Field(0) } else { fail!( @@ -93,8 +93,8 @@ impl EventSink for StructArrayBuilder { Field(_) => if matches!(ev, Event::EndStruct | Event::EndMap) { for (idx, seen) in this.seen.iter().enumerate() { if !seen { - if !this.field_meta[idx].nullable { - fail!("Missing field {} is not nullable", this.field_meta[idx].name); + if !this.field.children[idx].nullable { + fail!("Missing field {} is not nullable", this.field.children[idx].name); } this.builders[idx].accept_null()?; } @@ -163,7 +163,7 @@ impl EventSink for StructArrayBuilder { ev => fail!("Unexpected event while waiting for field: {ev}"), }; - let idx = if best_guess < this.field_meta.len() && this.field_meta[best_guess].name == key { + let idx = if best_guess < this.field.children.len() && this.field.children[best_guess].name == key { best_guess } else { this @@ -174,7 +174,7 @@ impl EventSink for StructArrayBuilder { }; if this.seen[idx] { - fail!("Duplicate field {}", this.field_meta[idx].name); + fail!("Duplicate field {}", this.field.children[idx].name); } this.seen[idx] = true; Value(idx, 0) diff --git a/serde_arrow/src/internal/generic_sinks/tuple.rs b/serde_arrow/src/internal/generic_sinks/tuple.rs index 19c38bc7..88bc3529 100644 --- a/serde_arrow/src/internal/generic_sinks/tuple.rs +++ b/serde_arrow/src/internal/generic_sinks/tuple.rs @@ -1,12 +1,12 @@ use crate::internal::{ error::{fail, Result}, event::Event, - schema::FieldMeta, + schema::GenericField, sink::{macros, EventSink}, }; pub struct TupleStructBuilder { - pub(crate) field_meta: Vec, + pub(crate) field: GenericField, pub(crate) builders: Vec, pub(crate) validity: Vec, pub(crate) state: TupleArrayBuilderState, @@ -14,9 +14,9 @@ pub struct TupleStructBuilder { } impl TupleStructBuilder { - pub fn new(field_meta: Vec, builders: Vec) -> Self { + pub fn new(field: GenericField, builders: Vec) -> Self { Self { - field_meta, + field, builders, validity: Vec::new(), state: TupleArrayBuilderState::Start, diff --git a/serde_arrow/src/internal/generic_sinks/union.rs b/serde_arrow/src/internal/generic_sinks/union.rs index 461bdb0b..4ade4ab4 100644 --- a/serde_arrow/src/internal/generic_sinks/union.rs +++ b/serde_arrow/src/internal/generic_sinks/union.rs @@ -1,14 +1,13 @@ use crate::internal::{ error::{fail, Result}, event::Event, - schema::FieldMeta, + schema::GenericField, sink::{macros, EventSink}, }; pub struct UnionArrayBuilder { next: UnionBuilderState, - pub field_meta: Vec, - pub nullable: bool, + pub field: GenericField, pub current_field_offsets: Vec, pub field_builders: Vec, @@ -18,11 +17,10 @@ pub struct UnionArrayBuilder { } impl UnionArrayBuilder { - pub fn new(field_meta: Vec, field_builders: Vec, nullable: bool) -> Self { + pub fn new(field: GenericField, field_builders: Vec) -> Self { let current_field_offsets = vec![0; field_builders.len()]; Self { - field_meta, - nullable, + field, next: UnionBuilderState::Inactive, current_field_offsets, diff --git a/serde_arrow/src/internal/schema.rs b/serde_arrow/src/internal/schema.rs index faa6cea4..bce68a0b 100644 --- a/serde_arrow/src/internal/schema.rs +++ b/serde_arrow/src/internal/schema.rs @@ -194,24 +194,6 @@ impl GenericField { } } -// Any non-dtype related info about a field -#[derive(Debug, Clone)] -pub struct FieldMeta { - pub name: String, - pub nullable: bool, - pub strategy: Option, -} - -impl From<&GenericField> for FieldMeta { - fn from(value: &GenericField) -> Self { - Self { - name: value.name.to_string(), - nullable: value.nullable, - strategy: value.strategy.clone(), - } - } -} - /// Configure how the schema is traced /// /// Example: diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 8b2d8b60..8165f136 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -115,6 +115,7 @@ //! //! | Feature | Arrow Version | //! |---------------|---------------| +//! | `arrow-37` | `arrow=37` | //! | `arrow-36` | `arrow=36` | //! | `arrow-35` | `arrow=35` | //! | `arrow2-0-17` | `arrow2=0.17` | @@ -177,7 +178,15 @@ pub mod _impl { }; } - #[cfg(feature = "arrow-36")] + #[cfg(feature = "arrow-37")] + build_arrow_crate!( + arrow_array_37, + arrow_buffer_37, + arrow_data_37, + arrow_schema_37 + ); + + #[cfg(all(feature = "arrow-36", not(feature = "arrow-37")))] build_arrow_crate!( arrow_array_36, arrow_buffer_36, @@ -185,7 +194,11 @@ pub mod _impl { arrow_schema_36 ); - #[cfg(all(feature = "arrow-35", not(feature = "arrow-36")))] + #[cfg(all( + feature = "arrow-35", + not(feature = "arrow-36"), + not(feature = "arrow-37") + ))] build_arrow_crate!( arrow_array_35, arrow_buffer_35, @@ -211,12 +224,12 @@ pub mod _impl { #[cfg(any(feature = "arrow2-0-17", feature = "arrow2-0-16"))] pub mod arrow2; -#[cfg(any(feature = "arrow-36", feature = "arrow-35"))] +#[cfg(any(feature = "arrow-36", feature = "arrow-35", feature = "arrow-37"))] pub mod arrow; #[cfg(all( test, - any(feature = "arrow-36", feature = "arrow-35"), + any(feature = "arrow-36", feature = "arrow-35", feature = "arrow-37"), any(feature = "arrow2-0-17", feature = "arrow2-0-16") ))] mod test_impls; diff --git a/x.py b/x.py index f0ef9f9c..6bdea5be 100644 --- a/x.py +++ b/x.py @@ -1,4 +1,5 @@ import argparse +import itertools as it import json import os import pathlib @@ -15,7 +16,7 @@ arg = lambda *a, **k: _md(lambda f: _as(f).insert(0, (a, k))) -all_arrow_features = ["arrow-35", "arrow-36"] +all_arrow_features = ["arrow-35", "arrow-36", "arrow-37"] all_arrow2_features = ["arrow2-0-16", "arrow2-0-17"] default_features = f"{all_arrow2_features[-1]},{all_arrow_features[-1]}" @@ -139,28 +140,30 @@ def summarize_bench(): median_times = {k: statistics.median(v) for k, v in grouped_times.items()} - print(f"{'':23s} serde_arrow manual ratio") - for group in ["complex", "primitives"]: - for op_label, serde_arrow_key, manual_key in [ - ("serialize", "serialize_into_arrays", "manually_serialize") - ]: - serde_arrow_time = median_times[group, serde_arrow_key] - manual_time = median_times[group, manual_key] - label = f"{op_label}({group})" - print( - f"{label:23s} " - f"{1000 * serde_arrow_time:9.1f}ms " - f"{1000 * manual_time:6.1f}ms " - f"{serde_arrow_time / manual_time:6.1f}x" + for group in sorted({g for g, _ in median_times}): + print("# ", group) + times_in_group = {n: v for (g, n), v in median_times.items() if g == group} + + rows = [["label", "time [ms]", *sorted(k[:15] for k in times_in_group)]] + + for label, time in sorted(times_in_group.items()): + rows.append( + [ + label, + f"{1000 * time:7.2f}", + *(f"{time / cmp:.2f}" for _, cmp in sorted(times_in_group.items())), + ] ) - print() - print() + widths = [max(len(row[i]) for row in rows) for i in range(len(rows[0]))] + for row in rows: + padded_row = [ + (str.ljust if idx == 0 else str.rjust)(item, width) + for idx, item, width in zip(it.count(), row, widths) + ] + print(" ".join(padded_row)) - print("# raw times") - for (g, n), v in sorted(median_times.items()): - label = f"{g}, {n}" - print(f"{label:40s} {1000 * v:8.1f}ms") + print() def collect(kv_pairs): From 09fcbc4c293d6c6a70dec6c345f34be24871797f Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 11 Apr 2023 19:01:36 +0200 Subject: [PATCH 3/5] Document related packages --- Readme.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Readme.md b/Readme.md index cc91cbb5..d23fe2d2 100644 --- a/Readme.md +++ b/Readme.md @@ -4,6 +4,7 @@ | [[API docs]](https://docs.rs/serde_arrow/latest/serde_arrow/) | [Changes](Changes.md) | [Example](#example) +| [Related packages](#related-packages) | [Performance](#performance) | [How does it work?](serde_arrow/Implementation.md) | [Status](serde_arrow/Status.md) @@ -100,6 +101,19 @@ building the arrays can be expected. More complex types incur a smaller performance penalty. See the [benches](serde_arrow/benches/arrow2.rs) for details. +## Related packages + +- [`arrow`][arrow]: the JSON component of the official Arrow package supports + serializing objects that support serialize via the [RawDecoder][raw-decoder] + object. It supports primitives types, structs and lists +- [`arrow2-convert`][arrow2-convert]: adds derive macros to convert objects from + and to arrow2 arrays. It supports primitive types, structs, lists, and + chrono's date time types. Enum support is experimental according to the + Readme + +[raw-decoder]: https://docs.rs/arrow-json/37.0.0/arrow_json/struct.RawDecoder.html#method.serialize +[arrow2-convert]: https://github.com/DataEngineeringLabs/arrow2-convert + ## Development All common tasks are bundled in the `x.py` script: From 2a45d491eac905bd49fada3af6e1a199d5125b88 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 11 Apr 2023 19:26:06 +0200 Subject: [PATCH 4/5] Add changes --- Changes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changes.md b/Changes.md index c25e9e00..1d616f19 100644 --- a/Changes.md +++ b/Changes.md @@ -1,5 +1,9 @@ # Change log +## 0.6.1 + +- Add support for `arrow=37` with the `arrow-37` feature + ## 0.6.0 ### Add support for arrow2 From 2410adfb4564acf86a0063a7550176398b6e6697 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 11 Apr 2023 19:37:13 +0200 Subject: [PATCH 5/5] Reformat code --- serde_arrow/benches/arrow2.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/serde_arrow/benches/arrow2.rs b/serde_arrow/benches/arrow2.rs index 48e776cc..68cb3c54 100644 --- a/serde_arrow/benches/arrow2.rs +++ b/serde_arrow/benches/arrow2.rs @@ -122,7 +122,6 @@ fn benchmark_serialize_arrow2_primitives(c: &mut Criterion) { group.finish(); } - fn benchmark_deserialize_arrow2_primitives(c: &mut Criterion) { let mut group = c.benchmark_group("deserialize_arrow2_primitives"); group.sample_size(20); @@ -139,7 +138,9 @@ fn benchmark_deserialize_arrow2_primitives(c: &mut Criterion) { group.bench_function("serde_arrow", |b| { b.iter(|| { - black_box::>(arrow2::deserialize_from_arrays(&fields, &arrays).unwrap()) + black_box::>( + arrow2::deserialize_from_arrays(&fields, &arrays).unwrap(), + ) }); });