Skip to content

Commit

Permalink
feat: Add support for variable size binary interop with arrow-rs (#191)
Browse files Browse the repository at this point in the history
Signed-off-by: Matthijs Brobbel <m1brobbel@gmail.com>
Co-authored-by: Matthijs Brobbel <m1brobbel@gmail.com>
  • Loading branch information
johanpel and mbrobbel authored Jun 25, 2024
1 parent 8565373 commit 40cb3a5
Show file tree
Hide file tree
Showing 7 changed files with 330 additions and 15 deletions.
9 changes: 8 additions & 1 deletion examples/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ fn main() {
use arrow_array::RecordBatch;
use arrow_cast::pretty;
use bytes::Bytes;
use narrow::{array::StructArray, arrow::buffer::ScalarBuffer, ArrayType};
use narrow::{
array::{StructArray, VariableSizeBinary},
arrow::buffer::ScalarBuffer,
ArrayType,
};
use parquet::arrow::{arrow_reader::ParquetRecordBatchReader, ArrowWriter};
use uuid::Uuid;

Expand All @@ -20,6 +24,7 @@ fn main() {
f: Bar,
g: [u8; 8],
h: Uuid,
i: VariableSizeBinary,
}
let input = [
Foo {
Expand All @@ -31,6 +36,7 @@ fn main() {
f: Bar(Some(true)),
g: [1, 2, 3, 4, 5, 6, 7, 8],
h: Uuid::from_u128(1234),
i: vec![1, 3, 3, 7].into(),
},
Foo {
a: 42,
Expand All @@ -41,6 +47,7 @@ fn main() {
f: Bar(None),
g: [9, 10, 11, 12, 13, 14, 15, 16],
h: Uuid::from_u128(42),
i: vec![4, 2].into(),
},
];

Expand Down
44 changes: 44 additions & 0 deletions src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use crate::{
buffer::BufferType,
offset::{self, OffsetElement},
Length,
};
use std::{collections::VecDeque, marker::PhantomData};

Expand Down Expand Up @@ -163,6 +164,49 @@ impl<const N: usize> From<FixedSizeBinary<N>> for [u8; N] {
}
}

/// A byte vector wrapper that maps to [`VariableSizeBinaryArray`] via its
/// [`ArrayType`] implementation.
///
/// Use it, for example, to map a `Vec<u8>` to a [`VariableSizeBinaryArray`]
/// instead of a [`VariableSizeListArray`]. The wrapper holds an owned
/// `Vec<u8>` and converts to and from it through its [`From`] implementations.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct VariableSizeBinary(Vec<u8>);

/// Maps a non-optional [`VariableSizeBinary`] to a [`VariableSizeBinaryArray`]
/// whose nullability parameter is `false`. The `UnionLayout` parameter is
/// accepted but not used by this mapping.
impl ArrayType<VariableSizeBinary> for VariableSizeBinary {
type Array<Buffer: BufferType, OffsetItem: OffsetElement, UnionLayout: UnionType> =
VariableSizeBinaryArray<false, OffsetItem, Buffer>;
}

/// Maps an optional [`VariableSizeBinary`] to a [`VariableSizeBinaryArray`]
/// whose nullability parameter is `true`. The `UnionLayout` parameter is
/// accepted but not used by this mapping.
impl ArrayType<VariableSizeBinary> for Option<VariableSizeBinary> {
type Array<Buffer: BufferType, OffsetItem: OffsetElement, UnionLayout: UnionType> =
VariableSizeBinaryArray<true, OffsetItem, Buffer>;
}

impl From<Vec<u8>> for VariableSizeBinary {
fn from(value: Vec<u8>) -> Self {
Self(value)
}
}

impl From<VariableSizeBinary> for Vec<u8> {
fn from(value: VariableSizeBinary) -> Self {
value.0
}
}

/// The length of a [`VariableSizeBinary`] is the number of bytes it holds.
impl Length for VariableSizeBinary {
    fn len(&self) -> usize {
        let Self(bytes) = self;
        bytes.len()
    }
}

impl IntoIterator for VariableSizeBinary {
type Item = u8;
type IntoIter = std::vec::IntoIter<u8>;

fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}

impl<T: ArrayType<T>, const N: usize> ArrayType<[T; N]> for [T; N] {
type Array<Buffer: BufferType, OffsetItem: OffsetElement, UnionLayout: UnionType> =
FixedSizeListArray<
Expand Down
32 changes: 18 additions & 14 deletions src/array/variable_size_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,13 @@ where
impl<T, const NULLABLE: bool, OffsetItem: OffsetElement, Buffer: BufferType> FromIterator<T>
for VariableSizeBinaryArray<NULLABLE, OffsetItem, Buffer>
where
T: IntoIterator,
<Buffer as BufferType>::Buffer<OffsetItem>: Validity<NULLABLE>,
Offset<FixedSizePrimitiveArray<u8, false, Buffer>, NULLABLE, OffsetItem, Buffer>:
FromIterator<T>,
FromIterator<<T as IntoIterator>::IntoIter>,
{
fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
Self(iter.into_iter().collect())
Self(iter.into_iter().map(IntoIterator::into_iter).collect())
}
}

Expand Down Expand Up @@ -217,17 +218,26 @@ impl<OffsetItem: OffsetElement, Buffer: BufferType> ValidityBitmap
#[cfg(test)]
mod tests {
use super::*;
use crate::buffer::BufferRef;
use crate::{array::VariableSizeBinary, buffer::BufferRef};
use std::mem;

#[test]
fn from_iter() {
let input: [&[u8]; 4] = [&[1], &[2, 3], &[4, 5, 6], &[7, 8, 9, 0]];
fn from_variable_size_binary() {
let input: [Vec<u8>; 4] = [vec![0, 1, 2], vec![3], vec![], vec![4, 5]];
let array = input
.into_iter()
.map(<[u8]>::to_vec)
.map(VariableSizeBinary)
.collect::<VariableSizeBinaryArray>();
assert_eq!(array.len(), 4);
assert_eq!(array.0.data.0, &[0, 1, 2, 3, 4, 5]);
assert_eq!(array.0.offsets, &[0, 3, 4, 4, 6]);
}

#[test]
fn from_iter() {
let input: [&[u8]; 4] = [&[1], &[2, 3], &[4, 5, 6], &[7, 8, 9, 0]];
let array = input.into_iter().collect::<VariableSizeBinaryArray>();
assert_eq!(array.len(), 4);
assert_eq!(array.0.data.0, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]);
assert_eq!(array.0.offsets, &[0, 1, 3, 6, 10]);

Expand All @@ -241,10 +251,7 @@ mod tests {
#[test]
fn from_iter_nullable() {
let input: [Option<&[u8]>; 4] = [Some(&[1]), None, Some(&[4, 5, 6]), Some(&[7, 8, 9, 0])];
let array = input
.into_iter()
.map(|x| x.map(<[u8]>::to_vec))
.collect::<VariableSizeBinaryArray<true>>();
let array = input.into_iter().collect::<VariableSizeBinaryArray<true>>();
assert_eq!(array.len(), 4);
assert_eq!(array.0.data.0, &[1, 4, 5, 6, 7, 8, 9, 0]);
assert_eq!(array.0.offsets.as_ref(), &[0, 1, 1, 4, 8]);
Expand All @@ -271,10 +278,7 @@ mod tests {
#[test]
fn index() {
let input: [&[u8]; 4] = [&[1], &[2, 3], &[4, 5, 6], &[7, 8, 9, 0]];
let array = input
.into_iter()
.map(<[u8]>::to_vec)
.collect::<VariableSizeBinaryArray>();
let array = input.into_iter().collect::<VariableSizeBinaryArray>();
assert_eq!(array.index_checked(0), &[1]);
assert_eq!(array.index_checked(1), &[2, 3]);
assert_eq!(array.index_checked(2), &[4, 5, 6]);
Expand Down
1 change: 1 addition & 0 deletions src/arrow/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ mod logical;
mod null;
mod union;
pub use union::UnionArrayTypeFields;
mod variable_size_binary;
mod variable_size_list;
Loading

0 comments on commit 40cb3a5

Please sign in to comment.