From 36d3b7c9570dd51f09a4c7164efdb22514a7ece8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 29 Feb 2024 09:15:23 +0100 Subject: [PATCH] perf: parquet elide utf8/binary cast (#14757) --- .../src/arrow/read/schema/convert.rs | 36 +++++++++---------- .../polars/tests/it/io/parquet/arrow/read.rs | 15 ++++---- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/crates/polars-parquet/src/arrow/read/schema/convert.rs b/crates/polars-parquet/src/arrow/read/schema/convert.rs index d4d4ef15f19b..2089e261188f 100644 --- a/crates/polars-parquet/src/arrow/read/schema/convert.rs +++ b/crates/polars-parquet/src/arrow/read/schema/convert.rs @@ -150,15 +150,15 @@ fn from_byte_array( converted_type: &Option, ) -> ArrowDataType { match (logical_type, converted_type) { - (Some(PrimitiveLogicalType::String), _) => ArrowDataType::LargeUtf8, - (Some(PrimitiveLogicalType::Json), _) => ArrowDataType::LargeBinary, - (Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::LargeBinary, - (Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::LargeBinary, - (_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::LargeBinary, - (_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::LargeBinary, - (_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::LargeBinary, - (_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::LargeUtf8, - (_, _) => ArrowDataType::LargeBinary, + (Some(PrimitiveLogicalType::String), _) => ArrowDataType::Utf8View, + (Some(PrimitiveLogicalType::Json), _) => ArrowDataType::BinaryView, + (Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::BinaryView, + (Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::BinaryView, + (_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::BinaryView, + (_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::BinaryView, + (_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::BinaryView, + (_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::Utf8View, + (_, _) => ArrowDataType::BinaryView, } } @@ -439,8 +439,8 @@ mod tests { Field::new("int64", ArrowDataType::Int64, false), Field::new("double", ArrowDataType::Float64, true), Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::LargeUtf8, true), - Field::new("string_2", ArrowDataType::LargeUtf8, true), + Field::new("string", ArrowDataType::Utf8View, true), + Field::new("string_2", ArrowDataType::Utf8View, true), ]; let parquet_schema = SchemaDescriptor::try_from_message(message)?; @@ -459,7 +459,7 @@ mod tests { } "; let expected = vec![ - Field::new("binary", ArrowDataType::LargeBinary, false), + Field::new("binary", ArrowDataType::BinaryView, false), Field::new("fixed_binary", ArrowDataType::FixedSizeBinary(20), false), ]; @@ -732,7 +732,7 @@ mod tests { { let struct_fields = vec![ - Field::new("event_name", ArrowDataType::LargeUtf8, false), + Field::new("event_name", ArrowDataType::Utf8View, false), Field::new( "event_time", ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), @@ -792,7 +792,7 @@ mod tests { "my_list1", ArrowDataType::LargeList(Box::new(Field::new( "element", - ArrowDataType::LargeUtf8, + ArrowDataType::Utf8View, true, ))), false, @@ -810,7 +810,7 @@ mod tests { "my_list2", ArrowDataType::LargeList(Box::new(Field::new( "element", - ArrowDataType::LargeUtf8, + ArrowDataType::Utf8View, false, ))), true, @@ -828,7 +828,7 @@ mod tests { "my_list3", ArrowDataType::LargeList(Box::new(Field::new( "element", - ArrowDataType::LargeUtf8, + ArrowDataType::Utf8View, false, ))), false, @@ -1058,7 +1058,7 @@ mod tests { Field::new("int64", ArrowDataType::Int64, false), Field::new("double", ArrowDataType::Float64, true), Field::new("float", ArrowDataType::Float32, true), - Field::new("string", ArrowDataType::LargeUtf8, true), + Field::new("string", ArrowDataType::Utf8View, true), Field::new( "bools", ArrowDataType::LargeList(Box::new(Field::new( @@ -1115,7 +1115,7 @@ mod tests { ]), false, ), - Field::new("dictionary_strings", ArrowDataType::LargeUtf8, false), + Field::new("dictionary_strings", ArrowDataType::Utf8View, false), ]; let parquet_schema = SchemaDescriptor::try_from_message(message_type)?; diff --git a/crates/polars/tests/it/io/parquet/arrow/read.rs b/crates/polars/tests/it/io/parquet/arrow/read.rs index fa6502557d81..1767c63da388 100644 --- a/crates/polars/tests/it/io/parquet/arrow/read.rs +++ b/crates/polars/tests/it/io/parquet/arrow/read.rs @@ -35,11 +35,11 @@ fn all_types() -> PolarsResult<()> { let result = batches[0].columns()[9] .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); assert_eq!( result, - &BinaryArray::::from_slice([[48], [49], [48], [49], [48], [49], [48], [49]]) + &BinaryViewArray::from_slice_values([[48], [49], [48], [49], [48], [49], [48], [49]]) ); Ok(()) @@ -84,18 +84,21 @@ fn all_types_chunked() -> PolarsResult<()> { let result = batches[0].columns()[9] .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); assert_eq!( result, - &BinaryArray::::from_slice([[48], [49], [48], [49], [48]]) + &BinaryViewArray::from_slice_values([[48], [49], [48], [49], [48]]) ); let result = batches[1].columns()[9] .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - assert_eq!(result, &BinaryArray::::from_slice([[49], [48], [49]])); + assert_eq!( + result, + &BinaryViewArray::from_slice_values([[49], [48], [49]]) + ); Ok(()) }