Skip to content

Commit

Permalink
Add ExtensionType trait and CanonicalExtensionType enum (#5822)
Browse files Browse the repository at this point in the history
* Add `ExtensionType` for `uuid` and map to parquet logical type

* Fix docs

* Use an `ExtensionType` trait instead

* Fix clippy warnings

* Add type annotation to fix build

* Update `ExtensionType` trait to support more canonical extension types

* Add `Json` support to parquet (schema roundtrip not working yet)

* Fix some clippy warnings

* Add explicit lifetime (resolving an elided lifetime to `'static` in associated consts was only added in Rust 1.81)

* Replace use of deprecated method, mark roundtrip as todo

* Add more tests and missing impls

* Add missing type annotations

* Fix doc warning

* Add the feature to the `arrow` crate and use underscores

* Update feature name in `parquet` crate

* Add experimental warning to `extensions` module docs

* Add a note about the associated metadata type

* Fix `Json` canonical extension type empty string metadata

* Simplify `Bool8::deserialize_metadata`

* Use `Empty` instead of `serde_json::Map` in `JsonMetadata`

* Use `map_or` instead of `is_some_and` (msrv)

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
mbrobbel and alamb authored Feb 2, 2025
1 parent 43617b2 commit 8baaa8b
Show file tree
Hide file tree
Showing 17 changed files with 2,350 additions and 12 deletions.
8 changes: 4 additions & 4 deletions arrow-array/src/array/list_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -895,8 +895,8 @@ mod tests {
.build()
.unwrap(),
);
assert_eq!(string.value_offsets(), &[]);
assert_eq!(string.value_sizes(), &[]);
assert_eq!(string.value_offsets(), &[] as &[i32; 0]);
assert_eq!(string.value_sizes(), &[] as &[i32; 0]);

let string = LargeListViewArray::from(
ArrayData::builder(DataType::LargeListView(f))
Expand All @@ -906,8 +906,8 @@ mod tests {
.unwrap(),
);
assert_eq!(string.len(), 0);
assert_eq!(string.value_offsets(), &[]);
assert_eq!(string.value_sizes(), &[]);
assert_eq!(string.value_offsets(), &[] as &[i64; 0]);
assert_eq!(string.value_sizes(), &[] as &[i64; 0]);
}

#[test]
Expand Down
12 changes: 9 additions & 3 deletions arrow-schema/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,27 @@ path = "src/lib.rs"
bench = false

[dependencies]
serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true }
serde = { version = "1.0", default-features = false, features = [
"derive",
"std",
"rc",
], optional = true }
bitflags = { version = "2.0.0", default-features = false, optional = true }
serde_json = { version = "1.0", optional = true }

[features]
canonical_extension_types = ["dep:serde", "dep:serde_json"]
# Enable ffi support
ffi = ["bitflags"]
serde = ["dep:serde"]

[package.metadata.docs.rs]
features = ["ffi"]

[dev-dependencies]
serde_json = "1.0"
bincode = { version = "1.3.3", default-features = false }
criterion = { version = "0.5", default-features = false }

[[bench]]
name = "ffi"
harness = false
harness = false
142 changes: 142 additions & 0 deletions arrow-schema/src/extension/canonical/bool8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! 8-bit Boolean
//!
//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
use crate::{extension::ExtensionType, ArrowError, DataType};

/// The canonical extension type for `8-bit Boolean`.
///
/// Extension name: `arrow.bool8`.
///
/// The storage type of the extension is `Int8` where:
/// - false is denoted by the value 0.
/// - true can be specified using any non-zero value. Preferably 1.
///
/// The extension metadata is always the empty string: it is serialized as
/// an empty string, and deserialization rejects missing or non-empty
/// metadata.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
#[derive(Debug, Default, Clone, Copy, PartialEq)]
pub struct Bool8;

impl ExtensionType for Bool8 {
    const NAME: &'static str = "arrow.bool8";

    /// `Bool8` carries no information in its metadata; the only valid value
    /// is the empty string.
    type Metadata = &'static str;

    /// Returns the metadata of this extension type, which is always empty.
    fn metadata(&self) -> &Self::Metadata {
        &""
    }

    /// Serializes the metadata: always `Some` holding an empty string.
    fn serialize_metadata(&self) -> Option<String> {
        Some(String::new())
    }

    /// Accepts only metadata that is present and empty.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::InvalidArgumentError`] when the metadata is
    /// missing (`None`) or is a non-empty string.
    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
        match metadata {
            Some("") => Ok(""),
            _ => Err(ArrowError::InvalidArgumentError(
                "Bool8 extension type expects an empty string as metadata".to_owned(),
            )),
        }
    }

    /// Checks that `data_type` is the `Int8` storage type of this extension.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::InvalidArgumentError`] naming the offending
    /// type for anything other than [`DataType::Int8`].
    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
        if matches!(data_type, DataType::Int8) {
            return Ok(());
        }

        Err(ArrowError::InvalidArgumentError(format!(
            "Bool8 data type mismatch, expected Int8, found {data_type}"
        )))
    }

    /// Builds a `Bool8` after validating the storage type; the (already
    /// deserialized) metadata carries no information and is ignored.
    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
        let bool8 = Self;
        bool8.supports_data_type(data_type)?;
        Ok(bool8)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "canonical_extension_types")]
    use crate::extension::CanonicalExtensionType;
    use crate::{
        extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
        Field,
    };

    /// Builds an unnamed, non-nullable `Int8` field whose field metadata
    /// consists of exactly the given key/value pairs.
    fn int8_field_with_metadata<const N: usize>(entries: [(&str, &str); N]) -> Field {
        let metadata = entries
            .into_iter()
            .map(|(key, value)| (key.to_owned(), value.to_owned()))
            .collect();
        Field::new("", DataType::Int8, false).with_metadata(metadata)
    }

    /// An `Int8` field accepts the extension type and yields it back.
    #[test]
    fn valid() -> Result<(), ArrowError> {
        let mut field = Field::new("", DataType::Int8, false);
        field.try_with_extension_type(Bool8)?;
        field.try_extension_type::<Bool8>()?;
        #[cfg(feature = "canonical_extension_types")]
        assert_eq!(
            field.try_canonical_extension_type()?,
            CanonicalExtensionType::Bool8(Bool8)
        );

        Ok(())
    }

    /// Extension metadata without an extension name is rejected.
    #[test]
    #[should_panic(expected = "Field extension type name missing")]
    fn missing_name() {
        let field = int8_field_with_metadata([(EXTENSION_TYPE_METADATA_KEY, "")]);
        field.extension_type::<Bool8>();
    }

    /// Any storage type other than `Int8` is rejected.
    #[test]
    #[should_panic(expected = "expected Int8, found Boolean")]
    fn invalid_type() {
        Field::new("", DataType::Boolean, false).with_extension_type(Bool8);
    }

    /// An extension name without the metadata entry is rejected.
    #[test]
    #[should_panic(expected = "Bool8 extension type expects an empty string as metadata")]
    fn missing_metadata() {
        let field = int8_field_with_metadata([(EXTENSION_TYPE_NAME_KEY, Bool8::NAME)]);
        field.extension_type::<Bool8>();
    }

    /// Non-empty extension metadata is rejected.
    #[test]
    #[should_panic(expected = "Bool8 extension type expects an empty string as metadata")]
    fn invalid_metadata() {
        let field = int8_field_with_metadata([
            (EXTENSION_TYPE_NAME_KEY, Bool8::NAME),
            (EXTENSION_TYPE_METADATA_KEY, "non-empty"),
        ]);
        field.extension_type::<Bool8>();
    }
}
Loading

0 comments on commit 8baaa8b

Please sign in to comment.