diff --git a/crates/polars-compute/src/min_max/dyn_array.rs b/crates/polars-compute/src/min_max/dyn_array.rs index 7298634a7cc9..64883aff2ae0 100644 --- a/crates/polars-compute/src/min_max/dyn_array.rs +++ b/crates/polars-compute/src/min_max/dyn_array.rs @@ -1,11 +1,19 @@ -use arrow::array::{Array, BooleanArray, PrimitiveArray}; -use arrow::scalar::{BooleanScalar, PrimitiveScalar, Scalar}; +use arrow::array::{ + Array, BinaryArray, BinaryViewArray, BooleanArray, PrimitiveArray, Utf8Array, Utf8ViewArray, +}; +use arrow::scalar::{BinaryScalar, BinaryViewScalar, BooleanScalar, PrimitiveScalar, Scalar}; use crate::min_max::MinMaxKernel; + macro_rules! call_op { - ($T:ty, $scalar:ty, $(=> ($($arg:expr),+),)? $arr:expr, $op:path) => {{ + ($T:ty, $scalar:ty, $arr:expr, $op:path) => {{ + let arr: &$T = $arr.as_any().downcast_ref().unwrap(); + $op(arr).map(|v| Box::new(<$scalar>::new(Some(v))) as Box) + }}; + (dt: $T:ty, $scalar:ty, $arr:expr, $op:path) => {{ let arr: &$T = $arr.as_any().downcast_ref().unwrap(); - $op(arr).map(|v| Box::new(<$scalar>::new($($($arg,)+)? Some(v))) as Box) + $op(arr) + .map(|v| Box::new(<$scalar>::new(arr.data_type().clone(), Some(v))) as Box) }}; } @@ -14,40 +22,30 @@ macro_rules! call { let arr = $arr; use arrow::datatypes::{PhysicalType as PH, PrimitiveType as PR}; + use PrimitiveArray as PArr; + use PrimitiveScalar as PScalar; match arr.data_type().to_physical_type() { PH::Boolean => call_op!(BooleanArray, BooleanScalar, arr, $op), - PH::Primitive(PR::Int8) => call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op), - PH::Primitive(PR::Int16) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::Int32) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::Int64) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::Int128) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::UInt8) => call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op), - PH::Primitive(PR::UInt16) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::UInt32) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::UInt64) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::UInt128) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::Float32) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, - PH::Primitive(PR::Float64) => { - call_op!(PrimitiveArray, PrimitiveScalar, => (arr.data_type().clone()), arr, $op) - }, + PH::Primitive(PR::Int8) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::Int16) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::Int32) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::Int64) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::Int128) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::UInt8) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::UInt16) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::UInt32) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::UInt64) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::UInt128) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::Float32) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Primitive(PR::Float64) => call_op!(dt: PArr, PScalar, arr, $op), + + PH::BinaryView => call_op!(BinaryViewArray, BinaryViewScalar<[u8]>, arr, $op), + PH::Utf8View => call_op!(Utf8ViewArray, BinaryViewScalar, arr, $op), + + PH::Binary => call_op!(BinaryArray, BinaryScalar, arr, $op), + PH::LargeBinary => call_op!(BinaryArray, BinaryScalar, arr, $op), + PH::Utf8 => call_op!(Utf8Array, BinaryScalar, arr, $op), + PH::LargeUtf8 => call_op!(Utf8Array, BinaryScalar, arr, $op), _ => todo!("Dynamic MinMax is not yet implemented for {:?}", arr.data_type()), } diff --git a/crates/polars-compute/src/min_max/mod.rs b/crates/polars-compute/src/min_max/mod.rs index e16f43f11eb9..32b22f06b84f 100644 --- a/crates/polars-compute/src/min_max/mod.rs +++ b/crates/polars-compute/src/min_max/mod.rs @@ -5,7 +5,7 @@ pub use self::dyn_array::{ dyn_array_min_propagate_nan, }; -// Low-level min/max kernel. +/// Low-level min/max kernel. pub trait MinMaxKernel { type Scalar<'a>: MinMax where diff --git a/crates/polars-compute/src/min_max/scalar.rs b/crates/polars-compute/src/min_max/scalar.rs index f64b41330b56..5a14fc571a08 100644 --- a/crates/polars-compute/src/min_max/scalar.rs +++ b/crates/polars-compute/src/min_max/scalar.rs @@ -1,5 +1,7 @@ -use arrow::array::{Array, BinaryViewArray, BooleanArray, PrimitiveArray, Utf8ViewArray}; -use arrow::types::NativeType; +use arrow::array::{ + Array, BinaryArray, BinaryViewArray, BooleanArray, PrimitiveArray, Utf8Array, Utf8ViewArray, +}; +use arrow::types::{NativeType, Offset}; use polars_utils::min_max::MinMax; use super::MinMaxKernel; @@ -149,3 +151,65 @@ impl MinMaxKernel for Utf8ViewArray { self.max_ignore_nan_kernel() } } + +impl MinMaxKernel for BinaryArray { + type Scalar<'a> = &'a [u8]; + + fn min_ignore_nan_kernel(&self) -> Option> { + if self.null_count() == 0 { + self.values_iter().reduce(MinMax::min_ignore_nan) + } else { + self.non_null_values_iter().reduce(MinMax::min_ignore_nan) + } + } + + fn max_ignore_nan_kernel(&self) -> Option> { + if self.null_count() == 0 { + self.values_iter().reduce(MinMax::max_ignore_nan) + } else { + self.non_null_values_iter().reduce(MinMax::max_ignore_nan) + } + } + + #[inline(always)] + fn min_propagate_nan_kernel(&self) -> Option> { + self.min_ignore_nan_kernel() + } + + #[inline(always)] + fn max_propagate_nan_kernel(&self) -> Option> { + self.max_ignore_nan_kernel() + } +} + +impl MinMaxKernel for Utf8Array { + type Scalar<'a> = &'a str; + + #[inline(always)] + fn min_ignore_nan_kernel(&self) -> Option> { + self.to_binary().min_ignore_nan_kernel().map(|s| unsafe { + // SAFETY: the lifetime is the same, and it is valid UTF-8. + #[allow(clippy::transmute_bytes_to_str)] + std::mem::transmute::<&[u8], &str>(s) + }) + } + + #[inline(always)] + fn max_ignore_nan_kernel(&self) -> Option> { + self.to_binary().max_ignore_nan_kernel().map(|s| unsafe { + // SAFETY: the lifetime is the same, and it is valid UTF-8. + #[allow(clippy::transmute_bytes_to_str)] + std::mem::transmute::<&[u8], &str>(s) + }) + } + + #[inline(always)] + fn min_propagate_nan_kernel(&self) -> Option> { + self.min_ignore_nan_kernel() + } + + #[inline(always)] + fn max_propagate_nan_kernel(&self) -> Option> { + self.max_ignore_nan_kernel() + } +} diff --git a/crates/polars-compute/src/min_max/simd.rs b/crates/polars-compute/src/min_max/simd.rs index e72df453c54d..8c25725b618b 100644 --- a/crates/polars-compute/src/min_max/simd.rs +++ b/crates/polars-compute/src/min_max/simd.rs @@ -30,6 +30,10 @@ where F: FnMut(Simd, Simd) -> Simd, LaneCount: SupportedLaneCount, { + if arr.is_empty() { + return None; + } + let mut arr_chunks = arr.chunks_exact(N); let identity = Simd::splat(scalar_identity); diff --git a/crates/polars-parquet/src/arrow/write/binary/basic.rs b/crates/polars-parquet/src/arrow/write/binary/basic.rs index 895c1c3a762e..e675986d81cc 100644 --- a/crates/polars-parquet/src/arrow/write/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/write/binary/basic.rs @@ -91,29 +91,19 @@ pub(crate) fn build_statistics( primitive_type: PrimitiveType, options: &StatisticsOptions, ) -> ParquetStatistics { + use polars_compute::min_max::MinMaxKernel; + BinaryStatistics { primitive_type, null_count: options.null_count.then_some(array.null_count() as i64), distinct_count: None, max_value: options .max_value - .then(|| { - array - .iter() - .flatten() - .max_by(|x, y| ord_binary(x, y)) - .map(|x| x.to_vec()) - }) + .then(|| array.max_propagate_nan_kernel().map(<[u8]>::to_vec)) .flatten(), min_value: options .min_value - .then(|| { - array - .iter() - .flatten() - .min_by(|x, y| ord_binary(x, y)) - .map(|x| x.to_vec()) - }) + .then(|| array.min_propagate_nan_kernel().map(<[u8]>::to_vec)) .flatten(), } .serialize()