Skip to content

Commit

Permalink
perf: parquet elide utf8/binary cast (#14757)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Feb 29, 2024
1 parent 6b8f8ba commit 36d3b7c
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 24 deletions.
36 changes: 18 additions & 18 deletions crates/polars-parquet/src/arrow/read/schema/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,15 +150,15 @@ fn from_byte_array(
converted_type: &Option<PrimitiveConvertedType>,
) -> ArrowDataType {
match (logical_type, converted_type) {
- (Some(PrimitiveLogicalType::String), _) => ArrowDataType::LargeUtf8,
- (Some(PrimitiveLogicalType::Json), _) => ArrowDataType::LargeBinary,
- (Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::LargeBinary,
- (Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::LargeBinary,
- (_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::LargeBinary,
- (_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::LargeBinary,
- (_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::LargeBinary,
- (_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::LargeUtf8,
- (_, _) => ArrowDataType::LargeBinary,
+ (Some(PrimitiveLogicalType::String), _) => ArrowDataType::Utf8View,
+ (Some(PrimitiveLogicalType::Json), _) => ArrowDataType::BinaryView,
+ (Some(PrimitiveLogicalType::Bson), _) => ArrowDataType::BinaryView,
+ (Some(PrimitiveLogicalType::Enum), _) => ArrowDataType::BinaryView,
+ (_, Some(PrimitiveConvertedType::Json)) => ArrowDataType::BinaryView,
+ (_, Some(PrimitiveConvertedType::Bson)) => ArrowDataType::BinaryView,
+ (_, Some(PrimitiveConvertedType::Enum)) => ArrowDataType::BinaryView,
+ (_, Some(PrimitiveConvertedType::Utf8)) => ArrowDataType::Utf8View,
+ (_, _) => ArrowDataType::BinaryView,
}
}

Expand Down Expand Up @@ -439,8 +439,8 @@ mod tests {
Field::new("int64", ArrowDataType::Int64, false),
Field::new("double", ArrowDataType::Float64, true),
Field::new("float", ArrowDataType::Float32, true),
- Field::new("string", ArrowDataType::LargeUtf8, true),
- Field::new("string_2", ArrowDataType::LargeUtf8, true),
+ Field::new("string", ArrowDataType::Utf8View, true),
+ Field::new("string_2", ArrowDataType::Utf8View, true),
];

let parquet_schema = SchemaDescriptor::try_from_message(message)?;
Expand All @@ -459,7 +459,7 @@ mod tests {
}
";
let expected = vec![
- Field::new("binary", ArrowDataType::LargeBinary, false),
+ Field::new("binary", ArrowDataType::BinaryView, false),
Field::new("fixed_binary", ArrowDataType::FixedSizeBinary(20), false),
];

Expand Down Expand Up @@ -732,7 +732,7 @@ mod tests {

{
let struct_fields = vec![
- Field::new("event_name", ArrowDataType::LargeUtf8, false),
+ Field::new("event_name", ArrowDataType::Utf8View, false),
Field::new(
"event_time",
ArrowDataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
Expand Down Expand Up @@ -792,7 +792,7 @@ mod tests {
"my_list1",
ArrowDataType::LargeList(Box::new(Field::new(
"element",
- ArrowDataType::LargeUtf8,
+ ArrowDataType::Utf8View,
true,
))),
false,
Expand All @@ -810,7 +810,7 @@ mod tests {
"my_list2",
ArrowDataType::LargeList(Box::new(Field::new(
"element",
- ArrowDataType::LargeUtf8,
+ ArrowDataType::Utf8View,
false,
))),
true,
Expand All @@ -828,7 +828,7 @@ mod tests {
"my_list3",
ArrowDataType::LargeList(Box::new(Field::new(
"element",
- ArrowDataType::LargeUtf8,
+ ArrowDataType::Utf8View,
false,
))),
false,
Expand Down Expand Up @@ -1058,7 +1058,7 @@ mod tests {
Field::new("int64", ArrowDataType::Int64, false),
Field::new("double", ArrowDataType::Float64, true),
Field::new("float", ArrowDataType::Float32, true),
- Field::new("string", ArrowDataType::LargeUtf8, true),
+ Field::new("string", ArrowDataType::Utf8View, true),
Field::new(
"bools",
ArrowDataType::LargeList(Box::new(Field::new(
Expand Down Expand Up @@ -1115,7 +1115,7 @@ mod tests {
]),
false,
),
- Field::new("dictionary_strings", ArrowDataType::LargeUtf8, false),
+ Field::new("dictionary_strings", ArrowDataType::Utf8View, false),
];

let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
Expand Down
15 changes: 9 additions & 6 deletions crates/polars/tests/it/io/parquet/arrow/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ fn all_types() -> PolarsResult<()> {

let result = batches[0].columns()[9]
.as_any()
- .downcast_ref::<BinaryArray<i64>>()
+ .downcast_ref::<BinaryViewArray>()
.unwrap();
assert_eq!(
result,
- &BinaryArray::<i64>::from_slice([[48], [49], [48], [49], [48], [49], [48], [49]])
+ &BinaryViewArray::from_slice_values([[48], [49], [48], [49], [48], [49], [48], [49]])
);

Ok(())
Expand Down Expand Up @@ -84,18 +84,21 @@ fn all_types_chunked() -> PolarsResult<()> {

let result = batches[0].columns()[9]
.as_any()
- .downcast_ref::<BinaryArray<i64>>()
+ .downcast_ref::<BinaryViewArray>()
.unwrap();
assert_eq!(
result,
- &BinaryArray::<i64>::from_slice([[48], [49], [48], [49], [48]])
+ &BinaryViewArray::from_slice_values([[48], [49], [48], [49], [48]])
);

let result = batches[1].columns()[9]
.as_any()
- .downcast_ref::<BinaryArray<i64>>()
+ .downcast_ref::<BinaryViewArray>()
.unwrap();
- assert_eq!(result, &BinaryArray::<i64>::from_slice([[49], [48], [49]]));
+ assert_eq!(
+ result,
+ &BinaryViewArray::from_slice_values([[49], [48], [49]])
+ );

Ok(())
}
Expand Down

0 comments on commit 36d3b7c

Please sign in to comment.