From 0a783c2e48b5769d727c3c2fce521c6221764621 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 4 Oct 2024 08:11:40 +0200 Subject: [PATCH 01/17] big boy commit [skip ci] --- .../src/array/growable/structure.rs | 6 + crates/polars-arrow/src/array/struct_/data.rs | 1 + crates/polars-arrow/src/array/struct_/ffi.rs | 2 +- crates/polars-arrow/src/array/struct_/mod.rs | 61 ++-- .../polars-arrow/src/array/struct_/mutable.rs | 44 ++- crates/polars-arrow/src/compute/cast/mod.rs | 1 + .../src/compute/take/structure.rs | 2 +- .../src/io/avro/read/deserialize.rs | 3 +- .../polars-arrow/src/io/avro/read/nested.rs | 10 +- .../src/io/ipc/read/array/struct_.rs | 8 +- crates/polars-arrow/src/legacy/array/mod.rs | 8 +- crates/polars-arrow/src/legacy/conversion.rs | 3 +- crates/polars-core/src/chunked_array/cast.rs | 2 +- .../logical/categorical/ops/unique.rs | 3 +- .../polars-core/src/chunked_array/ops/full.rs | 3 +- .../polars-core/src/chunked_array/ops/mod.rs | 3 +- .../src/chunked_array/ops/sort/mod.rs | 3 +- .../polars-core/src/chunked_array/ops/zip.rs | 2 +- .../polars-core/src/chunked_array/random.rs | 10 +- .../src/chunked_array/struct_/frame.rs | 2 +- .../src/chunked_array/struct_/mod.rs | 97 +++--- crates/polars-core/src/frame/arithmetic.rs | 2 +- crates/polars-core/src/frame/chunks.rs | 3 +- crates/polars-core/src/frame/column/mod.rs | 2 +- crates/polars-core/src/frame/from.rs | 2 +- crates/polars-core/src/frame/group_by/mod.rs | 2 +- crates/polars-core/src/frame/mod.rs | 288 ++++++++++++------ crates/polars-core/src/frame/row/av_buffer.rs | 21 +- crates/polars-core/src/frame/row/transpose.rs | 2 +- crates/polars-core/src/functions.rs | 2 +- crates/polars-core/src/serde/series.rs | 8 +- crates/polars-core/src/series/any_value.rs | 18 +- crates/polars-core/src/series/from.rs | 1 + crates/polars-core/src/series/into.rs | 8 +- crates/polars-core/src/series/mod.rs | 5 +- crates/polars-core/src/series/ops/null.rs | 2 +- crates/polars-core/src/utils/flatten.rs | 6 +- .../src/expressions/aggregation.rs | 10 +- crates/polars-expr/src/expressions/window.rs | 4 +- crates/polars-expr/src/state/node_timer.rs | 3 +- crates/polars-io/src/csv/read/read_impl.rs | 6 +- crates/polars-io/src/csv/read/reader.rs | 43 ++- crates/polars-io/src/hive.rs | 17 +- .../polars-io/src/parquet/read/read_impl.rs | 9 +- crates/polars-json/src/json/deserialize.rs | 2 +- crates/polars-lazy/src/dsl/eval.rs | 4 +- crates/polars-lazy/src/dsl/list.rs | 4 +- .../src/executors/group_by_partitioned.rs | 2 +- .../src/executors/projection_utils.rs | 7 +- .../src/executors/scan/python_scan.rs | 8 +- .../polars-mem-engine/src/executors/stack.rs | 2 +- .../src/chunked_array/array/to_struct.rs | 2 +- .../src/chunked_array/gather/chunked.rs | 8 +- crates/polars-ops/src/chunked_array/hist.rs | 2 +- .../src/chunked_array/list/to_struct.rs | 2 +- .../src/chunked_array/strings/extract.rs | 3 +- .../src/chunked_array/strings/json_path.rs | 1 + .../src/chunked_array/strings/split.rs | 4 +- .../polars-ops/src/frame/join/asof/groups.rs | 2 +- .../polars-ops/src/frame/join/merge_sorted.rs | 4 +- crates/polars-ops/src/frame/pivot/mod.rs | 2 +- .../polars-ops/src/frame/pivot/positioning.rs | 9 +- crates/polars-ops/src/frame/pivot/unpivot.rs | 2 +- crates/polars-ops/src/series/ops/cut.rs | 2 +- .../polars-ops/src/series/ops/horizontal.rs | 12 +- crates/polars-ops/src/series/ops/replace.rs | 3 +- crates/polars-ops/src/series/ops/rle.rs | 2 +- .../polars-ops/src/series/ops/to_dummies.rs | 8 +- crates/polars-ops/src/series/ops/various.rs | 3 +- 
.../src/arrow/read/deserialize/nested.rs | 3 +- .../src/arrow/read/statistics/mod.rs | 14 +- .../src/arrow/read/statistics/struct_.rs | 14 +- .../polars-parquet/src/arrow/write/pages.rs | 6 +- .../src/executors/operators/projection.rs | 5 +- .../src/executors/operators/reproject.rs | 2 +- .../sinks/group_by/generic/hash_table.rs | 2 +- .../executors/sinks/group_by/generic/mod.rs | 4 +- .../executors/sinks/group_by/primitive/mod.rs | 2 +- .../src/executors/sinks/group_by/string.rs | 2 +- .../src/executors/sinks/group_by/utils.rs | 4 +- .../sinks/joins/generic_probe_outer.rs | 4 + .../src/executors/sinks/sort/sink_multiple.rs | 99 +++--- .../polars-pipe/src/executors/sources/csv.rs | 4 + .../src/dsl/function_expr/coerce.rs | 21 +- .../src/dsl/function_expr/struct_.rs | 9 +- .../src/dsl/functions/horizontal.rs | 5 +- crates/polars-plan/src/dsl/name.rs | 2 +- .../src/plans/functions/merge_sorted.rs | 6 +- .../polars-plan/src/plans/functions/rename.rs | 3 +- crates/polars-python/src/dataframe/general.rs | 4 +- .../polars-python/src/interop/arrow/to_py.rs | 9 +- .../src/interop/arrow/to_rust.rs | 2 +- crates/polars-python/src/map/mod.rs | 10 +- .../polars-python/src/series/construction.rs | 2 + crates/polars-row/src/decode.rs | 3 +- .../nodes/parquet_source/row_group_decode.rs | 37 ++- .../tests/it/arrow/array/growable/mod.rs | 1 + .../tests/it/arrow/array/growable/struct_.rs | 12 +- crates/polars/tests/it/arrow/array/map/mod.rs | 6 + .../tests/it/arrow/array/struct_/iterator.rs | 1 + .../tests/it/arrow/array/struct_/mod.rs | 2 +- .../tests/it/arrow/array/struct_/mutable.rs | 31 -- crates/polars/tests/it/arrow/scalar/map.rs | 3 + crates/polars/tests/it/io/avro/read.rs | 2 + crates/polars/tests/it/io/avro/write.rs | 2 + .../polars/tests/it/io/parquet/arrow/mod.rs | 84 ++++- py-polars/tests/unit/datatypes/test_struct.py | 4 +- py-polars/tests/unit/io/test_json.py | 10 +- 108 files changed, 797 insertions(+), 477 deletions(-) delete mode 100644 crates/polars/tests/it/arrow/array/struct_/mutable.rs diff --git a/crates/polars-arrow/src/array/growable/structure.rs b/crates/polars-arrow/src/array/growable/structure.rs index 5f3d0c107c62..6c076864f585 100644 --- a/crates/polars-arrow/src/array/growable/structure.rs +++ b/crates/polars-arrow/src/array/growable/structure.rs @@ -10,6 +10,7 @@ use crate::bitmap::MutableBitmap; /// Concrete [`Growable`] for the [`StructArray`]. 
pub struct GrowableStruct<'a> { arrays: Vec<&'a StructArray>, + length: usize, validity: Option, values: Vec + 'a>>, } @@ -48,6 +49,7 @@ impl<'a> GrowableStruct<'a> { Self { arrays, + length: 0, values, validity: prepare_validity(use_validity, capacity), } @@ -60,6 +62,7 @@ impl<'a> GrowableStruct<'a> { StructArray::new( self.arrays[0].dtype().clone(), + self.length, values, validity.map(|v| v.into()), ) @@ -71,6 +74,8 @@ impl<'a> Growable<'a> for GrowableStruct<'a> { let array = *self.arrays.get_unchecked_release(index); extend_validity(&mut self.validity, array, start, len); + self.length += len; + if array.null_count() == 0 { self.values .iter_mut() @@ -123,6 +128,7 @@ impl<'a> From> for StructArray { StructArray::new( val.arrays[0].dtype().clone(), + val.length, values, val.validity.map(|v| v.into()), ) diff --git a/crates/polars-arrow/src/array/struct_/data.rs b/crates/polars-arrow/src/array/struct_/data.rs index ca8c5b0c6ec3..a65b491bfe77 100644 --- a/crates/polars-arrow/src/array/struct_/data.rs +++ b/crates/polars-arrow/src/array/struct_/data.rs @@ -21,6 +21,7 @@ impl Arrow2Arrow for StructArray { Self { dtype, + length: data.len(), values: data.child_data().iter().map(from_data).collect(), validity: data.nulls().map(|n| Bitmap::from_null_buffer(n.clone())), } diff --git a/crates/polars-arrow/src/array/struct_/ffi.rs b/crates/polars-arrow/src/array/struct_/ffi.rs index 3bfb9a1a7d7f..cc56f0f12cf3 100644 --- a/crates/polars-arrow/src/array/struct_/ffi.rs +++ b/crates/polars-arrow/src/array/struct_/ffi.rs @@ -68,6 +68,6 @@ impl FromFfi for StructArray { }) .collect::>>>()?; - Self::try_new(dtype, values, validity) + Self::try_new(dtype, len, values, validity) } } diff --git a/crates/polars-arrow/src/array/struct_/mod.rs b/crates/polars-arrow/src/array/struct_/mod.rs index decc95a2627a..6eebbc783a13 100644 --- a/crates/polars-arrow/src/array/struct_/mod.rs +++ b/crates/polars-arrow/src/array/struct_/mod.rs @@ -9,7 +9,7 @@ pub(super) mod fmt; mod iterator; mod mutable; pub use mutable::*; -use polars_error::{polars_bail, PolarsResult}; +use polars_error::{polars_bail, polars_ensure, PolarsResult}; use crate::compute::utils::combine_validities_and; @@ -34,6 +34,8 @@ pub struct StructArray { dtype: ArrowDataType, // invariant: each array has the same length values: Vec>, + // invariant: for each v in values: length == v.len() + length: usize, validity: Option, } @@ -49,22 +51,17 @@ impl StructArray { /// * the validity's length is not equal to the length of the first element pub fn try_new( dtype: ArrowDataType, + length: usize, values: Vec>, validity: Option, ) -> PolarsResult { let fields = Self::try_get_fields(&dtype)?; - if fields.is_empty() { - assert!(values.is_empty(), "invalid struct"); - assert_eq!(validity.map(|v| v.len()).unwrap_or(0), 0, "invalid struct"); - return Ok(Self { - dtype, - values, - validity: None, - }); - } - if fields.len() != values.len() { - polars_bail!(ComputeError:"a StructArray must have a number of fields in its DataType equal to the number of child values") - } + + polars_ensure!( + fields.len() == values.len(), + ComputeError: + "a StructArray must have a number of fields in its DataType equal to the number of child values" + ); fields .iter().map(|a| &a.dtype) @@ -81,15 +78,14 @@ impl StructArray { } })?; - let len = values[0].len(); values .iter() - .map(|a| a.len()) + .map(|f| f.len()) .enumerate() - .try_for_each(|(index, a_len)| { - if a_len != len { - polars_bail!(ComputeError: "The children must have an equal number of values. 
- However, the values at index {index} have a length of {a_len}, which is different from values at index 0, {len}.") + .try_for_each(|(index, f_length)| { + if f_length != length { + polars_bail!(ComputeError: "The children must have the given number of values. + However, the values at index {index} have a length of {f_length}, which is different from given length {length}.") } else { Ok(()) } @@ -97,13 +93,14 @@ impl StructArray { if validity .as_ref() - .map_or(false, |validity| validity.len() != len) + .map_or(false, |validity| validity.len() != length) { polars_bail!(ComputeError:"The validity length of a StructArray must match its number of elements") } Ok(Self { dtype, + length, values, validity, }) @@ -120,10 +117,11 @@ impl StructArray { /// * the validity's length is not equal to the length of the first element pub fn new( dtype: ArrowDataType, + length: usize, values: Vec>, validity: Option, ) -> Self { - Self::try_new(dtype, values, validity).unwrap() + Self::try_new(dtype, length, values, validity).unwrap() } /// Creates an empty [`StructArray`]. @@ -133,7 +131,7 @@ impl StructArray { .iter() .map(|field| new_empty_array(field.dtype().clone())) .collect(); - Self::new(dtype, values, None) + Self::new(dtype, 0, values, None) } else { panic!("StructArray must be initialized with DataType::Struct"); } @@ -146,7 +144,7 @@ impl StructArray { .iter() .map(|field| new_null_array(field.dtype().clone(), length)) .collect(); - Self::new(dtype, values, Some(Bitmap::new_zeroed(length))) + Self::new(dtype, length, values, Some(Bitmap::new_zeroed(length))) } else { panic!("StructArray must be initialized with DataType::Struct"); } @@ -157,9 +155,10 @@ impl StructArray { impl StructArray { /// Deconstructs the [`StructArray`] into its individual components. #[must_use] - pub fn into_data(self) -> (Vec, Vec>, Option) { + pub fn into_data(self) -> (Vec, usize, Vec>, Option) { let Self { dtype, + length, values, validity, } = self; @@ -168,7 +167,7 @@ impl StructArray { } else { unreachable!() }; - (fields, values, validity) + (fields, length, values, validity) } /// Slices this [`StructArray`]. @@ -199,6 +198,7 @@ impl StructArray { self.values .iter_mut() .for_each(|x| x.slice_unchecked(offset, length)); + self.length = length; } /// Set the outer nulls into the inner arrays. @@ -227,18 +227,17 @@ impl StructArray { impl StructArray { #[inline] fn len(&self) -> usize { - #[cfg(debug_assertions)] - if let Some(fst) = self.values.first() { - for arr in self.values.iter().skip(1) { + if cfg!(debug_assertions) { + for arr in self.values.iter() { assert_eq!( arr.len(), - fst.len(), + self.length, "StructArray invariant: each array has same length" ); } } - self.values.first().map(|arr| arr.len()).unwrap_or(0) + self.length } /// The optional validity. 
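The rewritten `len` makes the stored `length` the source of truth instead of `values[0].len()`. A minimal sketch of what that enables, assuming `polars_arrow`'s public paths: a struct array with zero fields can still carry a row count.

```
use polars_arrow::array::{Array, StructArray};
use polars_arrow::datatypes::ArrowDataType;

// Zero fields, three rows: previously unrepresentable, because the length
// used to be derived from the first child array.
let zfs = StructArray::new(ArrowDataType::Struct(vec![]), 3, vec![], None);
assert_eq!(zfs.len(), 3);
```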
@@ -310,11 +309,13 @@ impl Splitable for StructArray { ( Self { dtype: self.dtype.clone(), + length: offset, values: lhs_values, validity: lhs_validity, }, Self { dtype: self.dtype.clone(), + length: self.length - offset, values: rhs_values, validity: rhs_validity, }, diff --git a/crates/polars-arrow/src/array/struct_/mutable.rs b/crates/polars-arrow/src/array/struct_/mutable.rs index 286db07e2f97..ab4090053899 100644 --- a/crates/polars-arrow/src/array/struct_/mutable.rs +++ b/crates/polars-arrow/src/array/struct_/mutable.rs @@ -11,19 +11,19 @@ use crate::datatypes::ArrowDataType; #[derive(Debug)] pub struct MutableStructArray { dtype: ArrowDataType, + length: usize, values: Vec>, validity: Option, } fn check( dtype: &ArrowDataType, + length: usize, values: &[Box], validity: Option, ) -> PolarsResult<()> { let fields = StructArray::try_get_fields(dtype)?; - if fields.is_empty() { - polars_bail!(ComputeError: "a StructArray must contain at least one field") - } + if fields.len() != values.len() { polars_bail!(ComputeError: "a StructArray must have a number of fields in its DataType equal to the number of child values") } @@ -43,23 +43,22 @@ fn check( } })?; - let len = values[0].len(); values .iter() - .map(|a| a.len()) + .map(|f| f.len()) .enumerate() - .try_for_each(|(index, a_len)| { - if a_len != len { + .try_for_each(|(index, f_length)| { + if f_length != length { polars_bail!(ComputeError: "The children must have an equal number of values. - However, the values at index {index} have a length of {a_len}, which is different from values at index 0, {len}." + However, the values at index {index} have a length of {f_length}, which is different from given length {length}." ) } else { Ok(()) } })?; - if validity.map_or(false, |validity| validity != len) { + if validity.map_or(false, |validity| validity != length) { polars_bail!(ComputeError: "the validity length of a StructArray must match its number of elements", ) @@ -77,6 +76,7 @@ impl From for StructArray { StructArray::new( other.dtype, + other.length, other.values.into_iter().map(|mut v| v.as_box()).collect(), validity, ) @@ -85,8 +85,8 @@ impl From for StructArray { impl MutableStructArray { /// Creates a new [`MutableStructArray`]. - pub fn new(dtype: ArrowDataType, values: Vec>) -> Self { - Self::try_new(dtype, values, None).unwrap() + pub fn new(dtype: ArrowDataType, length: usize, values: Vec>) -> Self { + Self::try_new(dtype, length, values, None).unwrap() } /// Create a [`MutableStructArray`] out of low-end APIs. @@ -97,12 +97,14 @@ impl MutableStructArray { /// * `validity` is not `None` and its length is different from the `values`'s length pub fn try_new( dtype: ArrowDataType, + length: usize, values: Vec>, validity: Option, ) -> PolarsResult { - check(&dtype, &values, validity.as_ref().map(|x| x.len()))?; + check(&dtype, length, &values, validity.as_ref().map(|x| x.len()))?; Ok(Self { dtype, + length, values, validity, }) @@ -113,26 +115,17 @@ impl MutableStructArray { self, ) -> ( ArrowDataType, + usize, Vec>, Option, ) { - (self.dtype, self.values, self.validity) - } - - /// The mutable values - pub fn mut_values(&mut self) -> &mut Vec> { - &mut self.values + (self.dtype, self.length, self.values, self.validity) } /// The values pub fn values(&self) -> &Vec> { &self.values } - - /// Return the `i`th child array. 
- pub fn value(&mut self, i: usize) -> Option<&mut A> { - self.values[i].as_mut_any().downcast_mut::() - } } impl MutableStructArray { @@ -155,6 +148,7 @@ impl MutableStructArray { false => self.init_validity(), }, }; + self.length += 1; } fn push_null(&mut self) { @@ -193,7 +187,7 @@ impl MutableStructArray { impl MutableArray for MutableStructArray { fn len(&self) -> usize { - self.values.first().map(|v| v.len()).unwrap_or(0) + self.length } fn validity(&self) -> Option<&MutableBitmap> { @@ -203,6 +197,7 @@ impl MutableArray for MutableStructArray { fn as_box(&mut self) -> Box { StructArray::new( self.dtype.clone(), + self.length, std::mem::take(&mut self.values) .into_iter() .map(|mut v| v.as_box()) @@ -215,6 +210,7 @@ impl MutableArray for MutableStructArray { fn as_arc(&mut self) -> Arc { StructArray::new( self.dtype.clone(), + self.length, std::mem::take(&mut self.values) .into_iter() .map(|mut v| v.as_box()) diff --git a/crates/polars-arrow/src/compute/cast/mod.rs b/crates/polars-arrow/src/compute/cast/mod.rs index 27f93eb07356..2a544fc7209f 100644 --- a/crates/polars-arrow/src/compute/cast/mod.rs +++ b/crates/polars-arrow/src/compute/cast/mod.rs @@ -94,6 +94,7 @@ fn cast_struct( Ok(StructArray::new( to_type.clone(), + array.len(), new_values, array.validity().cloned(), )) diff --git a/crates/polars-arrow/src/compute/take/structure.rs b/crates/polars-arrow/src/compute/take/structure.rs index caad9f4ee0a4..472535fe126e 100644 --- a/crates/polars-arrow/src/compute/take/structure.rs +++ b/crates/polars-arrow/src/compute/take/structure.rs @@ -30,5 +30,5 @@ pub(super) unsafe fn take_unchecked(array: &StructArray, indices: &IdxArr) -> St .validity() .map(|b| super::bitmap::take_bitmap_nulls_unchecked(b, indices)); let validity = combine_validities_and(validity.as_ref(), indices.validity()); - StructArray::new(array.dtype().clone(), values, validity) + StructArray::new(array.dtype().clone(), indices.len(), values, validity) } diff --git a/crates/polars-arrow/src/io/avro/read/deserialize.rs b/crates/polars-arrow/src/io/avro/read/deserialize.rs index f9423f83305a..dd22eff764d9 100644 --- a/crates/polars-arrow/src/io/avro/read/deserialize.rs +++ b/crates/polars-arrow/src/io/avro/read/deserialize.rs @@ -56,7 +56,8 @@ fn make_mutable( .iter() .map(|field| make_mutable(field.dtype(), None, capacity)) .collect::>>()?; - Box::new(DynMutableStructArray::new(values, dtype.clone())) as Box + Box::new(DynMutableStructArray::new(values, 0, dtype.clone())) + as Box }, other => { polars_bail!(nyi = "Deserializing type {other:#?} is still not implemented") diff --git a/crates/polars-arrow/src/io/avro/read/nested.rs b/crates/polars-arrow/src/io/avro/read/nested.rs index fc7e07487d83..17baf2ef05e7 100644 --- a/crates/polars-arrow/src/io/avro/read/nested.rs +++ b/crates/polars-arrow/src/io/avro/read/nested.rs @@ -211,14 +211,16 @@ impl MutableArray for FixedItemsUtf8Dictionary { #[derive(Debug)] pub struct DynMutableStructArray { dtype: ArrowDataType, + length: usize, values: Vec>, validity: Option, } impl DynMutableStructArray { - pub fn new(values: Vec>, dtype: ArrowDataType) -> Self { + pub fn new(values: Vec>, length: usize, dtype: ArrowDataType) -> Self { Self { dtype, + length, values, validity: None, } @@ -234,6 +236,7 @@ impl DynMutableStructArray { if let Some(validity) = &mut self.validity { validity.push(true) } + self.length += 1; Ok(()) } @@ -244,6 +247,7 @@ impl DynMutableStructArray { Some(validity) => validity.push(false), None => self.init_validity(), } + self.length += 1; } fn 
init_validity(&mut self) { @@ -258,7 +262,7 @@ impl DynMutableStructArray { impl MutableArray for DynMutableStructArray { fn len(&self) -> usize { - self.values[0].len() + self.length } fn validity(&self) -> Option<&MutableBitmap> { @@ -270,6 +274,7 @@ impl MutableArray for DynMutableStructArray { Box::new(StructArray::new( self.dtype.clone(), + self.length, values, std::mem::take(&mut self.validity).map(|x| x.into()), )) @@ -280,6 +285,7 @@ impl MutableArray for DynMutableStructArray { std::sync::Arc::new(StructArray::new( self.dtype.clone(), + self.length, values, std::mem::take(&mut self.validity).map(|x| x.into()), )) diff --git a/crates/polars-arrow/src/io/ipc/read/array/struct_.rs b/crates/polars-arrow/src/io/ipc/read/array/struct_.rs index 5cf68f1d1d95..e2a99d2b5e24 100644 --- a/crates/polars-arrow/src/io/ipc/read/array/struct_.rs +++ b/crates/polars-arrow/src/io/ipc/read/array/struct_.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; -use polars_error::{polars_err, PolarsResult}; +use polars_error::{polars_ensure, polars_err, PolarsResult}; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; @@ -41,6 +41,10 @@ pub fn read_struct( )?; let fields = StructArray::get_fields(&dtype); + polars_ensure!( + !fields.is_empty(), + nyi = "Cannot read zero field structs from IPC" + ); let values = fields .iter() @@ -64,7 +68,7 @@ pub fn read_struct( }) .collect::>>()?; - StructArray::try_new(dtype, values, validity) + StructArray::try_new(dtype, values[0].len(), values, validity) } pub fn skip_struct( diff --git a/crates/polars-arrow/src/legacy/array/mod.rs b/crates/polars-arrow/src/legacy/array/mod.rs index f15ac1811f96..ed53797db9b9 100644 --- a/crates/polars-arrow/src/legacy/array/mod.rs +++ b/crates/polars-arrow/src/legacy/array/mod.rs @@ -238,7 +238,13 @@ pub fn convert_inner_type(array: &dyn Array, dtype: &ArrowDataType) -> Box>(); - StructArray::new(dtype.clone(), new_values, array.validity().cloned()).boxed() + StructArray::new( + dtype.clone(), + array.len(), + new_values, + array.validity().cloned(), + ) + .boxed() }, _ => new_null_array(dtype.clone(), array.len()), } diff --git a/crates/polars-arrow/src/legacy/conversion.rs b/crates/polars-arrow/src/legacy/conversion.rs index e2cb028036a0..0d1ef3ca2b99 100644 --- a/crates/polars-arrow/src/legacy/conversion.rs +++ b/crates/polars-arrow/src/legacy/conversion.rs @@ -5,7 +5,8 @@ use crate::types::NativeType; pub fn chunk_to_struct(chunk: RecordBatchT, fields: Vec) -> StructArray { let dtype = ArrowDataType::Struct(fields); - StructArray::new(dtype, chunk.into_arrays(), None) + dbg!("TODO: VERIFY"); + StructArray::new(dtype, chunk.len(), chunk.into_arrays(), None) } /// Returns its underlying [`Vec`], if possible. 
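Two strategies for the new length argument appear above: `chunk_to_struct` forwards the record batch's length (the `dbg!("TODO: VERIFY")` marks that choice for review), while the IPC `read_struct` still measures `values[0].len()` and therefore needs the zero-field guard. Because `try_new` now validates every child against the caller-supplied length, a disagreeing child is an error. A minimal sketch, assuming `polars_arrow`'s array and field constructors:

```
use polars_arrow::array::{Int32Array, StructArray};
use polars_arrow::datatypes::{ArrowDataType, Field};

let fields = vec![Field::new("a".into(), ArrowDataType::Int32, true)];
let child = Int32Array::from_slice([1, 2, 3]).boxed();

// The child has 3 rows but the caller claims 4: try_new now reports the
// mismatch against the given length instead of trusting values[0].
assert!(StructArray::try_new(ArrowDataType::Struct(fields), 4, vec![child], None).is_err());
```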
diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs index 1b8228d4ea69..98ae4962b1ae 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -125,7 +125,7 @@ fn cast_single_to_struct( new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype)); } - StructChunked::from_series(name, new_fields.iter()).map(|ca| ca.into_series()) + StructChunked::from_series(name, length, new_fields.iter()).map(|ca| ca.into_series()) } impl ChunkedArray diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index 7b851c5def54..707fc79f0364 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -66,8 +66,9 @@ impl CategoricalChunked { let mut counts = groups.group_count(); counts.rename(PlSmallStr::from_static("counts")); + let height = counts.len(); let cols = vec![values.into_series().into(), counts.into_series().into()]; - let df = unsafe { DataFrame::new_no_checks(cols) }; + let df = unsafe { DataFrame::new_no_checks(height, cols) }; df.sort( ["counts"], SortMultipleOptions::default().with_order_descending(true), diff --git a/crates/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs index e33d38118891..6285f44e4498 100644 --- a/crates/polars-core/src/chunked_array/ops/full.rs +++ b/crates/polars-core/src/chunked_array/ops/full.rs @@ -207,8 +207,7 @@ impl ListChunked { #[cfg(feature = "dtype-struct")] impl ChunkFullNull for StructChunked { fn full_null(name: PlSmallStr, length: usize) -> StructChunked { - let s = [Series::new_null(PlSmallStr::EMPTY, length)]; - StructChunked::from_series(name, s.iter()) + StructChunked::from_series(name, length, [].iter()) .unwrap() .with_outer_validity(Some(Bitmap::new_zeroed(length))) } diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 061278a22cc9..33f43d530e45 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -11,6 +11,7 @@ mod apply; mod approx_n_unique; pub mod arity; mod bit_repr; +mod bits; #[cfg(feature = "bitwise")] mod bitwise_reduce; pub(crate) mod chunkops; @@ -576,7 +577,7 @@ impl ChunkExpandAtIndex for StructChunked { }) .collect::>(); - StructArray::new(chunk.dtype().clone(), values, None).boxed() + StructArray::new(chunk.dtype().clone(), length, values, None).boxed() }; // SAFETY: chunks are from self. 
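Every `StructChunked::from_series` call now states the intended length up front; unit-length fields broadcast up to it and any other mismatch is rejected. A minimal sketch, assuming the `polars_core` prelude (the `example` wrapper only serves the `?` operator):

```
use polars_core::prelude::*;

fn example() -> PolarsResult<StructChunked> {
    let a = Series::new("a".into(), &[1i32, 2, 3]);
    let b = Series::new("b".into(), &[10i32]); // length 1: broadcast up to 3
    let ca = StructChunked::from_series("s".into(), 3, [a, b].iter())?;
    assert_eq!(ca.len(), 3);
    Ok(ca)
}
```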
diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index 0aa70dae1c83..d68583f1dc5c 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -724,7 +724,8 @@ pub(crate) fn convert_sort_column_multi_sort(s: &Series) -> PolarsResult .iter() .map(convert_sort_column_multi_sort) .collect::>>()?; - let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?; + let mut out = + StructChunked::from_series(ca.name().clone(), ca.len(), new_fields.iter())?; out.zip_outer_validity(ca); out.into_series() }, diff --git a/crates/polars-core/src/chunked_array/ops/zip.rs b/crates/polars-core/src/chunked_array/ops/zip.rs index 61f530280324..e46ee77dc4b4 100644 --- a/crates/polars-core/src/chunked_array/ops/zip.rs +++ b/crates/polars-core/src/chunked_array/ops/zip.rs @@ -294,7 +294,7 @@ impl ChunkZip for StructChunked { .map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs)) .collect::>>()?; - let mut out = StructChunked::from_series(self.name().clone(), fields.iter())?; + let mut out = StructChunked::from_series(self.name().clone(), length, fields.iter())?; fn rechunk_bitmaps( total_length: usize, diff --git a/crates/polars-core/src/chunked_array/random.rs b/crates/polars-core/src/chunked_array/random.rs index 1ad3d2b7abd7..b927cdc8f9ee 100644 --- a/crates/polars-core/src/chunked_array/random.rs +++ b/crates/polars-core/src/chunked_array/random.rs @@ -192,10 +192,7 @@ impl DataFrame { match n.get(0) { Some(n) => self.sample_n_literal(n as usize, with_replacement, shuffle, seed), - None => { - let new_cols = self.columns.iter().map(Column::clear).collect_trusted(); - Ok(unsafe { DataFrame::new_no_checks(new_cols) }) - }, + None => Ok(self.clear()), } } @@ -237,10 +234,7 @@ impl DataFrame { let n = (self.height() as f64 * frac) as usize; self.sample_n_literal(n, with_replacement, shuffle, seed) }, - None => { - let new_cols = self.columns.iter().map(Column::clear).collect_trusted(); - Ok(unsafe { DataFrame::new_no_checks(new_cols) }) - }, + None => Ok(self.clear()), } } } diff --git a/crates/polars-core/src/chunked_array/struct_/frame.rs b/crates/polars-core/src/chunked_array/struct_/frame.rs index 83f0f1299667..b175b3a04832 100644 --- a/crates/polars-core/src/chunked_array/struct_/frame.rs +++ b/crates/polars-core/src/chunked_array/struct_/frame.rs @@ -5,6 +5,6 @@ use crate::prelude::StructChunked; impl DataFrame { pub fn into_struct(self, name: PlSmallStr) -> StructChunked { - StructChunked::from_columns(name, &self.columns).expect("same invariants") + StructChunked::from_columns(name, self.height(), &self.columns).expect("same invariants") } } diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 0c4eb50ddc58..e2914dd823d5 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -20,12 +20,22 @@ pub type StructChunked = ChunkedArray; fn constructor<'a, I: ExactSizeIterator + Clone>( name: PlSmallStr, + length: usize, fields: I, ) -> PolarsResult { + if fields.len() == 0 { + let dtype = DataType::Struct(Vec::new()); + let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest()); + let chunks = vec![StructArray::new(arrow_dtype, length, Vec::new(), None).boxed()]; + + // SAFETY: invariants checked above. 
+ return Ok(unsafe { StructChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) }); + } + // Different chunk lengths: rechunk and recurse. if !fields.clone().map(|s| s.n_chunks()).all_equal() { let fields = fields.map(|s| s.rechunk()).collect::>(); - return constructor(name, fields.iter()); + return constructor(name, length, fields.iter()); } let n_chunks = fields.clone().next().unwrap().n_chunks(); @@ -39,11 +49,11 @@ fn constructor<'a, I: ExactSizeIterator + Clone>( .map(|field| field.chunks()[c_i].clone()) .collect::>(); - if !fields.iter().map(|arr| arr.len()).all_equal() { + if !fields.iter().all(|arr| length == arr.len()) { return Err(()); } - Ok(StructArray::new(arrow_dtype.clone(), fields, None).boxed()) + Ok(StructArray::new(arrow_dtype.clone(), length, fields, None).boxed()) }) .collect::, ()>>(); @@ -59,40 +69,44 @@ fn constructor<'a, I: ExactSizeIterator + Clone>( // Different chunk lengths: rechunk and recurse. Err(_) => { let fields = fields.map(|s| s.rechunk()).collect::>(); - constructor(name, fields.iter()) + constructor(name, length, fields.iter()) }, } } impl StructChunked { - pub fn from_columns(name: PlSmallStr, fields: &[Column]) -> PolarsResult { - Self::from_series(name, fields.iter().map(|c| c.as_materialized_series())) + pub fn from_columns(name: PlSmallStr, length: usize, fields: &[Column]) -> PolarsResult { + Self::from_series( + name, + length, + fields.iter().map(|c| c.as_materialized_series()), + ) } pub fn from_series<'a, I: ExactSizeIterator + Clone>( name: PlSmallStr, + length: usize, fields: I, ) -> PolarsResult { let mut names = PlHashSet::with_capacity(fields.len()); - let first_len = fields.clone().next().map(|s| s.len()).unwrap_or(0); - let mut max_len = first_len; - let mut all_equal_len = true; - let mut is_empty = false; + let mut needs_to_broadcast = false; for s in fields.clone() { let s_len = s.len(); - max_len = std::cmp::max(max_len, s_len); - if s_len != first_len { - all_equal_len = false; - } - if s_len == 0 { - is_empty = true; + if s_len != length && s_len != 1 { + polars_bail!( + ShapeMismatch: "expected struct fields to have given length. given = {length}, field length = {s_len}." 
+ ); } + + needs_to_broadcast |= length != 1 && s_len == 1; + polars_ensure!( names.insert(s.name()), Duplicate: "multiple fields with name '{}' found", s.name() ); + match s.dtype() { #[cfg(feature = "object")] DataType::Object(_, _) => { @@ -102,29 +116,25 @@ impl StructChunked { } } - if !all_equal_len { - let mut new_fields = Vec::with_capacity(fields.len()); - for s in fields { - let s_len = s.len(); - if is_empty { - new_fields.push(s.clear()) - } else if s_len == max_len { - new_fields.push(s.clone()) - } else if s_len == 1 { - new_fields.push(s.new_from_index(0, max_len)) + if !needs_to_broadcast { + return constructor(name, length, fields); + } + + if length == 0 { + let new_fields = fields.map(|s| s.clear()).collect::>(); + return constructor(name, length, new_fields.iter()); + } + + let new_fields = fields + .map(|s| { + if s.len() == length { + s.clone() } else { - polars_bail!( - ShapeMismatch: "expected all fields to have equal length" - ); + s.new_from_index(0, length) } - } - constructor(name, new_fields.iter()) - } else if fields.len() == 0 { - let fields = [Series::new_null(PlSmallStr::EMPTY, 0)]; - constructor(name, fields.iter()) - } else { - constructor(name, fields) - } + }) + .collect::>(); + constructor(name, length, new_fields.iter()) } pub fn struct_fields(&self) -> &[Field] { @@ -185,7 +195,8 @@ impl StructChunked { }) .collect::>>()?; - let mut out = Self::from_series(self.name().clone(), new_fields.iter())?; + let mut out = + Self::from_series(self.name().clone(), struct_len, new_fields.iter())?; if self.null_count > 0 { out.zip_outer_validity(self); } @@ -241,7 +252,7 @@ impl StructChunked { } }) .collect::>>()?; - let mut out = Self::from_series(self.name().clone(), fields.iter())?; + let mut out = Self::from_series(self.name().clone(), self.len(), fields.iter())?; if self.null_count > 0 { out.zip_outer_validity(self); } @@ -286,7 +297,7 @@ impl StructChunked { .iter() .map(func) .collect::>>()?; - Self::from_series(self.name().clone(), fields.iter()).map(|mut ca| { + Self::from_series(self.name().clone(), self.len(), fields.iter()).map(|mut ca| { if self.null_count > 0 { // SAFETY: we don't change types/ lengths. unsafe { @@ -361,10 +372,12 @@ impl StructChunked { .fields_as_series() .into_iter() .map(Column::from) - .collect(); + .collect::>(); + + let height = DataFrame::infer_height(&columns); // SAFETY: invariants for struct are the same - unsafe { DataFrame::new_no_checks(columns) } + unsafe { DataFrame::new_no_checks(height, columns) } } /// Get access to one of this `[StructChunked]`'s fields diff --git a/crates/polars-core/src/frame/arithmetic.rs b/crates/polars-core/src/frame/arithmetic.rs index 6d184b2960c9..cf6683d0a57c 100644 --- a/crates/polars-core/src/frame/arithmetic.rs +++ b/crates/polars-core/src/frame/arithmetic.rs @@ -25,7 +25,7 @@ macro_rules! 
impl_arithmetic { .map(|s| s.map(Column::from)) .collect::>() })?; - Ok(unsafe { DataFrame::new_no_checks(cols) }) + Ok(unsafe { DataFrame::new_no_checks($self.height(), cols) }) }}; } diff --git a/crates/polars-core/src/frame/chunks.rs b/crates/polars-core/src/frame/chunks.rs index 16fa8f7c1ff9..801c0d11b9c8 100644 --- a/crates/polars-core/src/frame/chunks.rs +++ b/crates/polars-core/src/frame/chunks.rs @@ -33,7 +33,8 @@ impl DataFrame { .map(Column::from) .collect::>(); - DataFrame::new_no_checks(columns) + let height = Self::infer_height(&columns); + DataFrame::new_no_checks(height, columns) }) } diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index 74b6f0e7da0a..6181cdd7204f 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -696,7 +696,7 @@ impl Column { pub fn into_frame(self) -> DataFrame { // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names - unsafe { DataFrame::new_no_checks(vec![self]) } + unsafe { DataFrame::new_no_checks(self.len(), vec![self]) } } pub fn unique_stable(&self) -> PolarsResult { diff --git a/crates/polars-core/src/frame/from.rs b/crates/polars-core/src/frame/from.rs index 5ec5d98a1597..e0282f4275fb 100644 --- a/crates/polars-core/src/frame/from.rs +++ b/crates/polars-core/src/frame/from.rs @@ -4,7 +4,7 @@ impl TryFrom<StructArray> for DataFrame { type Error = PolarsError; fn try_from(arr: StructArray) -> PolarsResult<Self> { - let (fld, arrs, nulls) = arr.into_data(); + let (fld, _length, arrs, nulls) = arr.into_data(); polars_ensure!( nulls.is_none(), ComputeError: "cannot deserialize struct with nulls into a DataFrame" diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index 89c72f5a0eac..9c56f7c49122 100644 --- a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -795,7 +795,7 @@ impl<'df> GroupBy<'df> { new_cols.extend_from_slice(&self.selected_keys); let cols = self.df.select_columns_impl(agg.as_slice())?; new_cols.extend(cols); - Ok(unsafe { DataFrame::new_no_checks(new_cols) }) + Ok(unsafe { DataFrame::new_no_checks(self.df.height(), new_cols) }) } } else { Ok(self.df.clone()) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index a06c5856a5f1..7f7b0d8be6a3 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -13,6 +13,7 @@ use crate::prelude::*; #[cfg(feature = "row_hash")] use crate::utils::split_df; use crate::utils::{slice_offsets, try_get_supertype, Container, NoNull}; +use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; #[cfg(feature = "dataframe_arithmetic")] mod arithmetic; @@ -170,7 +171,8 @@ where /// ``` #[derive(Clone)] pub struct DataFrame { - // invariant: Column.len() is the same for each column + height: usize, + // invariant: columns[i].len() == height for each 0 <= i < columns.len() pub(crate) columns: Vec<Column>, } @@ -286,18 +288,20 @@ impl DataFrame { pub fn new(columns: Vec<Column>) -> PolarsResult<Self> { ensure_names_unique(&columns, |s| s.name().as_str())?; - if columns.len() > 1 { - let first_len = columns[0].len(); - for col in &columns { - polars_ensure!( - col.len() == first_len, - ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}", - columns[0].len(), first_len, col.name(), col.len() - ); - } + let Some(fst) = columns.first() else { + return Ok(DataFrame { height: 0, columns }); + };
let height = fst.len(); for col in &columns[1..] { polars_ensure!( col.len() == height, ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}", columns[0].name(), height, col.name(), col.len() ); } - Ok(DataFrame { columns }) + Ok(DataFrame { height, columns }) } /// Converts a sequence of columns into a DataFrame, broadcasting length-1 @@ -338,7 +342,10 @@ impl DataFrame { *col = col.new_from_index(0, broadcast_len); } } - Ok(unsafe { DataFrame::new_no_checks(columns) }) + + let length = if columns.is_empty() { 0 } else { broadcast_len }; + + Ok(unsafe { DataFrame::new_no_checks(length, columns) }) } /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers). /// /// # Example /// /// ```rust /// use polars_core::prelude::DataFrame; /// static EMPTY: DataFrame = DataFrame::empty(); /// ``` pub const fn empty() -> Self { - DataFrame { columns: vec![] } + DataFrame { + height: 0, + columns: vec![], + } } /// Create an empty `DataFrame` with empty columns as per the `schema`. @@ -359,7 +369,7 @@ impl DataFrame { .iter() .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype))) .collect(); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(0, cols) } } /// Create an empty `DataFrame` with empty columns as per the `schema`. @@ -368,7 +378,7 @@ impl DataFrame { .iter_values() .map(|fld| Column::from(Series::new_empty(fld.name.clone(), &(fld.dtype().into())))) .collect(); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(0, cols) } } /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty. @@ -453,7 +463,17 @@ impl DataFrame { self } - /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the `Series`. + /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the + /// `Series`. + /// + /// Calculates the height from the first column or `0` if no columns are given. + pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame { + let height = columns.first().map_or(0, Column::len); + unsafe { Self::new_no_checks(height, columns) } + } + + /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the + /// `Series`. /// /// It is advised to use [DataFrame::new] in favor of this method. /// /// # Safety /// /// It is the caller's responsibility to uphold the contract of all `Series` /// having an equal length and a unique name, if not this may panic down the line. - pub unsafe fn new_no_checks(columns: Vec<Column>) -> DataFrame { - #[cfg(debug_assertions)] - { - Self::new(columns).unwrap() - } - #[cfg(not(debug_assertions))] - { - Self::_new_no_checks_impl(columns) + pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame { + if cfg!(debug_assertions) { + ensure_names_unique(&columns, |s| s.name().as_str()).unwrap(); + + for col in &columns { + assert_eq!(col.len(), height); + } } + + unsafe { Self::_new_no_checks_impl(height, columns) } } /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame /// constructed with this method is generally highly unsafe and should not be long-lived.
#[allow(clippy::missing_safety_doc)] - pub const unsafe fn _new_no_checks_impl(columns: Vec) -> DataFrame { - DataFrame { columns } + pub unsafe fn _new_no_checks_impl(height: usize, columns: Vec) -> DataFrame { + if cfg!(debug_assertions) && std::env::var("POLARS_VERIFY_COL_LENGTHS").is_ok() { + dbg!("Make const"); + for col in &columns { + assert_eq!(col.len(), height); + } + } + + DataFrame { height, columns } } /// Create a new `DataFrame` but does not check the length of the `Series`, @@ -492,15 +520,11 @@ impl DataFrame { pub unsafe fn new_no_length_checks(columns: Vec) -> PolarsResult { ensure_names_unique(&columns, |s| s.name().as_str())?; - Ok({ - #[cfg(debug_assertions)] - { - Self::new(columns).unwrap() - } - #[cfg(not(debug_assertions))] - { - DataFrame { columns } - } + Ok(if cfg!(debug_assertions) { + Self::new(columns).unwrap() + } else { + let height = Self::infer_height(&columns); + DataFrame { height, columns } }) } @@ -642,11 +666,31 @@ impl DataFrame { /// Get mutable access to the underlying columns. /// /// # Safety - /// The caller must ensure the length of all [`Series`] remains equal. + /// + /// The caller must ensure the length of all [`Series`] remains equal to `height` or + /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`. pub unsafe fn get_columns_mut(&mut self) -> &mut Vec { &mut self.columns } + #[inline] + /// Remove all the columns in the [`DataFrame`] but keep the `height`. + pub fn clear_columns(&mut self) { + unsafe { self.get_columns_mut() }.clear() + } + + #[inline] + /// Extend the columns without checking for name collisions or height. + pub unsafe fn column_extend_unchecked(&mut self, iter: impl Iterator) { + if cfg!(debug_assertions) { + for c in iter { + unsafe { self.with_column_unchecked(c) }; + } + } else { + unsafe { self.get_columns_mut() }.extend(iter) + } + } + /// Take ownership of the underlying columns vec. pub fn take_columns(self) -> Vec { self.columns @@ -806,10 +850,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn shape(&self) -> (usize, usize) { - match self.columns.as_slice() { - &[] => (0, 0), - v => (v[0].len(), v.len()), - } + (self.height, self.columns.len()) } /// Get the width of the [`DataFrame`] which is the number of columns. @@ -872,7 +913,16 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn is_empty(&self) -> bool { - self.height() == 0 + matches!(self.shape(), (0, _) | (_, 0)) + } + + /// Set the height (i.e. number of rows) of this [`DataFrame`]. + /// + /// # Safety + /// + /// This needs to be equal to the length of all the columns. + pub unsafe fn set_height(&mut self, height: usize) { + self.height = height; } /// Add multiple [`Series`] to a [`DataFrame`]. @@ -1020,6 +1070,7 @@ impl DataFrame { left.append(right)?; Ok(()) })?; + self.height += other.height; Ok(self) } @@ -1036,6 +1087,7 @@ impl DataFrame { .for_each(|(left, right)| { left.append(right).expect("should not fail"); }); + self.height += other.height; } /// Extend the memory backed by this [`DataFrame`] with the values from `other`. @@ -1059,6 +1111,7 @@ impl DataFrame { "unable to extend a DataFrame of width {} with a DataFrame of width {}", self.width(), other.width(), ); + self.columns .iter_mut() .zip(other.columns.iter()) @@ -1066,7 +1119,9 @@ impl DataFrame { ensure_can_extend(&*left, right)?; left.extend(right)?; Ok(()) - }) + })?; + self.height += other.height; + Ok(()) } /// Remove a column by name and return the column removed. 
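The new `height` field decouples the row count from the columns, which is what `clear_columns`, `set_height`, and the widened `is_empty` manage. A minimal sketch of the observable behavior, assuming the `polars_core` prelude:

```
use polars_core::prelude::*;

fn example() -> PolarsResult<()> {
    let mut df = df!("a" => [1, 2, 3])?;
    df.clear_columns();             // drop all columns, keep the height
    assert_eq!(df.shape(), (3, 0)); // shape is (height, width)
    assert!(df.is_empty());         // empty now means zero rows OR zero columns
    Ok(())
}
```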
@@ -1173,7 +1228,7 @@ impl DataFrame { } }); - Ok(unsafe { DataFrame::new_no_checks(new_cols) }) + Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) }) } /// Drop columns that are in `names`. @@ -1198,7 +1253,7 @@ impl DataFrame { } }); - unsafe { DataFrame::new_no_checks(new_cols) } + unsafe { DataFrame::new_no_checks(self.height(), new_cols) } } /// Insert a new column at a given index without checking for duplicates. @@ -1213,6 +1268,11 @@ impl DataFrame { ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}", column.len(), self.height(), ); + + if self.width() == 0 { + self.height = column.len(); + } + self.columns.insert(index, column); Ok(self) } @@ -1232,6 +1292,10 @@ impl DataFrame { if let Some(idx) = self.get_column_index(column.name().as_str()) { self.replace_column(idx, column)?; } else { + if self.width() == 0 { + self.height = column.len(); + } + self.columns.push(column); } Ok(()) @@ -1274,7 +1338,13 @@ impl DataFrame { debug_assert!(self.width() == 0 || self.height() == column.len()); debug_assert!(self.get_column_index(column.name().as_str()).is_none()); + // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height + // properly for `width` == 0. + if self.width() == 0 { + unsafe { self.set_height(column.len()) }; + } unsafe { self.get_columns_mut() }.push(column); + self } @@ -1288,6 +1358,10 @@ impl DataFrame { self.replace_column(idx, c)?; } } else { + if self.width() == 0 { + self.height = c.len(); + } + self.columns.push(c); } Ok(()) @@ -1393,14 +1467,6 @@ impl DataFrame { self.columns.get(idx) } - /// Select a mutable series by index. - /// - /// *Note: the length of the Series should remain the same otherwise the DataFrame is invalid.* - /// For this reason the method is not public - fn select_at_idx_mut(&mut self, idx: usize) -> Option<&mut Column> { - self.columns.get_mut(idx) - } - /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`] /// /// # Examples @@ -1559,7 +1625,7 @@ impl DataFrame { pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult { let selected = self.select_columns_impl(cols)?; - Ok(unsafe { DataFrame::new_no_checks(selected) }) + Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) }) } /// Select with a known schema. @@ -1596,7 +1662,7 @@ impl DataFrame { ensure_names_unique(cols, |s| s.as_str())?; } let selected = self.select_columns_impl_with_schema(cols, schema)?; - Ok(unsafe { DataFrame::new_no_checks(selected) }) + Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) }) } /// A non generic implementation to reduce compiler bloat. @@ -1625,7 +1691,7 @@ impl DataFrame { fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult { ensure_names_unique(cols, |s| s.as_str())?; let selected = self.select_columns_physical_impl(cols)?; - Ok(unsafe { DataFrame::new_no_checks(selected) }) + Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) }) } /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`]. @@ -1701,13 +1767,14 @@ impl DataFrame { Ok(selected) } - /// Select a mutable series by name. 
- /// *Note: the length of the Series should remain the same otherwise the DataFrame is invalid.* - /// For this reason the method is not public - fn select_mut(&mut self, name: &str) -> Option<&mut Column> { - let opt_idx = self.get_column_index(name); + fn filter_height(filtered: &[Column], mask: &BooleanChunked) -> usize { + // If there is a filtered column just see how many columns there are left. + if let Some(fst) = filtered.first() { + return fst.len(); + } - opt_idx.and_then(|idx| self.select_at_idx_mut(idx)) + // Otherwise, count the number of values that would be filtered and return that height. + mask.num_trues() } /// Take the [`DataFrame`] rows by a boolean mask. @@ -1723,13 +1790,17 @@ impl DataFrame { /// ``` pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult { let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?; - Ok(unsafe { DataFrame::new_no_checks(new_col) }) + let height = Self::filter_height(&new_col, mask); + + Ok(unsafe { DataFrame::new_no_checks(height, new_col) }) } /// Same as `filter` but does not parallelize. pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult { let new_col = self.try_apply_columns(&|s| s.filter(mask))?; - Ok(unsafe { DataFrame::new_no_checks(new_col) }) + let height = Self::filter_height(&new_col, mask); + + Ok(unsafe { DataFrame::new_no_checks(height, new_col) }) } /// Take [`DataFrame`] rows by index values. @@ -1746,7 +1817,7 @@ impl DataFrame { pub fn take(&self, indices: &IdxCa) -> PolarsResult { let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?; - Ok(unsafe { DataFrame::new_no_checks(new_col) }) + Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) }) } /// # Safety @@ -1766,7 +1837,7 @@ impl DataFrame { .map(Column::from) .collect() }; - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(idx.len(), cols) } } pub(crate) unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self { @@ -1782,7 +1853,7 @@ impl DataFrame { .map(Column::from) .collect() }; - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(idx.len(), cols) } } /// Rename a column in the [`DataFrame`]. 
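`filter_height` chooses the new height: with at least one filtered column it is simply that column's length, and on a width-0 frame it falls back to counting the rows the mask keeps via `mask.num_trues()`. A minimal sketch, assuming the `polars_core` prelude:

```
use polars_core::prelude::*;

fn example() -> PolarsResult<()> {
    let mut df = df!("a" => [1, 2, 3, 4])?;
    df.clear_columns(); // a (4, 0) frame: no columns left to measure

    let mask = BooleanChunked::new("m".into(), [true, false, true, false]);
    assert_eq!(df.filter(&mask)?.height(), 2); // mask.num_trues() == 2
    Ok(())
}
```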
@@ -1805,7 +1876,9 @@ impl DataFrame { self.columns.iter().all(|c| c.name() != &name), Duplicate: "column rename attempted with already existing name \"{name}\"" ); - self.select_mut(column) + + self.get_column_index(column) + .and_then(|idx| self.columns.get_mut(idx)) .ok_or_else(|| polars_err!(col_not_found = column)) .map(|c| c.rename(name))?; Ok(self) @@ -1987,20 +2060,23 @@ impl DataFrame { } unsafe { - DataFrame::new_no_checks(vec![ - column_names.finish().into_column(), - repr_ca.finish().into_column(), - sorted_asc_ca.finish().into_column(), - sorted_dsc_ca.finish().into_column(), - fast_explode_list_ca.finish().into_column(), - min_value_ca.finish().into_column(), - max_value_ca.finish().into_column(), - IdxCa::from_slice_options( - PlSmallStr::from_static("distinct_count"), - &distinct_count_ca[..], - ) - .into_column(), - ]) + DataFrame::new_no_checks( + self.width(), + vec![ + column_names.finish().into_column(), + repr_ca.finish().into_column(), + sorted_asc_ca.finish().into_column(), + sorted_dsc_ca.finish().into_column(), + fast_explode_list_ca.finish().into_column(), + min_value_ca.finish().into_column(), + max_value_ca.finish().into_column(), + IdxCa::from_slice_options( + PlSmallStr::from_static("distinct_count"), + &distinct_count_ca[..], + ) + .into_column(), + ], + ) } } @@ -2392,20 +2468,30 @@ impl DataFrame { .iter() .map(|s| s.slice(offset, length)) .collect::>(); - unsafe { DataFrame::new_no_checks(col) } + + let height = if let Some(fst) = col.first() { + fst.len() + } else { + length + }; + + unsafe { DataFrame::new_no_checks(height, col) } } /// Split [`DataFrame`] at the given `offset`. pub fn split_at(&self, offset: i64) -> (Self, Self) { let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip(); - let a = unsafe { DataFrame::new_no_checks(a) }; - let b = unsafe { DataFrame::new_no_checks(b) }; + + let (idx, _) = slice_offsets(offset, 0, self.height()); + + let a = unsafe { DataFrame::new_no_checks(idx, a) }; + let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) }; (a, b) } pub fn clear(&self) -> Self { let col = self.columns.iter().map(|s| s.clear()).collect::>(); - unsafe { DataFrame::new_no_checks(col) } + unsafe { DataFrame::new_no_checks(0, col) } } #[must_use] @@ -2415,7 +2501,7 @@ impl DataFrame { } // @scalar-opt let columns = self._apply_columns_par(&|s| s.slice(offset, length)); - unsafe { DataFrame::new_no_checks(columns) } + unsafe { DataFrame::new_no_checks(length, columns) } } #[must_use] @@ -2429,7 +2515,7 @@ impl DataFrame { out.shrink_to_fit(); out }); - unsafe { DataFrame::new_no_checks(columns) } + unsafe { DataFrame::new_no_checks(length, columns) } } /// Get the head of the [`DataFrame`]. @@ -2472,7 +2558,10 @@ impl DataFrame { .iter() .map(|c| c.head(length)) .collect::>(); - unsafe { DataFrame::new_no_checks(col) } + + let height = length.unwrap_or(HEAD_DEFAULT_LENGTH); + let height = usize::min(height, self.height()); + unsafe { DataFrame::new_no_checks(height, col) } } /// Get the tail of the [`DataFrame`]. @@ -2512,7 +2601,10 @@ impl DataFrame { .iter() .map(|c| c.tail(length)) .collect::>(); - unsafe { DataFrame::new_no_checks(col) } + + let height = length.unwrap_or(TAIL_DEFAULT_LENGTH); + let height = usize::min(height, self.height()); + unsafe { DataFrame::new_no_checks(height, col) } } /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches. 
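`split_at`, `head`, and `tail` now compute the resulting heights explicitly instead of inferring them from a first column that may not exist; for `split_at`, the offset is resolved against the height first, so negative offsets count from the end. A minimal sketch, assuming the `polars_core` prelude:

```
use polars_core::prelude::*;

fn example() -> PolarsResult<()> {
    let df = df!("a" => [1, 2, 3, 4, 5])?;
    let (left, right) = df.split_at(-2); // offset -2 resolves to 3 of height 5
    assert_eq!((left.height(), right.height()), (3, 2));
    Ok(())
}
```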
@@ -2568,7 +2660,7 @@ impl DataFrame { #[must_use] pub fn reverse(&self) -> Self { let col = self.columns.iter().map(|s| s.reverse()).collect::>(); - unsafe { DataFrame::new_no_checks(col) } + unsafe { DataFrame::new_no_checks(self.height(), col) } } /// Shift the values by a given period and fill the parts that will be empty due to this operation @@ -2578,7 +2670,7 @@ impl DataFrame { #[must_use] pub fn shift(&self, periods: i64) -> Self { let col = self._apply_columns_par(&|s| s.shift(periods)); - unsafe { DataFrame::new_no_checks(col) } + unsafe { DataFrame::new_no_checks(self.height(), col) } } /// Replace None values with one of the following strategies: @@ -2592,7 +2684,7 @@ impl DataFrame { pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?; - Ok(unsafe { DataFrame::new_no_checks(col) }) + Ok(unsafe { DataFrame::new_no_checks(self.height(), col) }) } /// Aggregate the column horizontally to their min values. @@ -2735,7 +2827,7 @@ impl DataFrame { }) .cloned() .collect(); - let numeric_df = unsafe { DataFrame::_new_no_checks_impl(columns) }; + let numeric_df = unsafe { DataFrame::_new_no_checks_impl(self.height(), columns) }; let sum = || numeric_df.sum_horizontal(null_strategy); @@ -2933,7 +3025,9 @@ impl DataFrame { return df.filter(&mask); }, }; - Ok(unsafe { DataFrame::new_no_checks(columns) }) + + let height = Self::infer_height(&columns); + Ok(unsafe { DataFrame::new_no_checks(height, columns) }) } /// Get a mask of all the unique rows in the [`DataFrame`]. @@ -2994,7 +3088,7 @@ impl DataFrame { .iter() .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize])) .collect(); - unsafe { Self::new_no_checks(cols) } + unsafe { Self::new_no_checks(1, cols) } } /// Hash and combine the row values @@ -3176,6 +3270,10 @@ impl DataFrame { } DataFrame::new(new_cols) } + + pub(crate) fn infer_height(cols: &[Column]) -> usize { + cols.first().map_or(0, Column::len) + } } pub struct RecordBatchIter<'a> { diff --git a/crates/polars-core/src/frame/row/av_buffer.rs b/crates/polars-core/src/frame/row/av_buffer.rs index 5d8da9c55666..03cac11538d5 100644 --- a/crates/polars-core/src/frame/row/av_buffer.rs +++ b/crates/polars-core/src/frame/row/av_buffer.rs @@ -547,6 +547,7 @@ impl<'a> AnyValueBufferTrusted<'a> { } } + /// Clear `self` and give `capacity`, returning the old contents as a [`Series`]. pub fn reset(&mut self, capacity: usize) -> Series { use AnyValueBufferTrusted::*; match self { @@ -616,15 +617,33 @@ impl<'a> AnyValueBufferTrusted<'a> { }, #[cfg(feature = "dtype-struct")] Struct(b) => { + // @Q? Maybe we need to add a length parameter here for ZFS's. I am not very happy + // with just setting the length to zero for that case. 
+ if b.is_empty() { + return StructChunked::from_series(PlSmallStr::EMPTY, 0, [].iter()) + .unwrap() + .into_series(); + } + + let mut min_len = usize::MAX; + let mut max_len = usize::MIN; + let v = b .iter_mut() .map(|(b, name)| { let mut s = b.reset(capacity); + + min_len = min_len.min(s.len()); + max_len = max_len.max(s.len()); + s.rename(name.clone()); s }) .collect::>(); - StructChunked::from_series(PlSmallStr::EMPTY, v.iter()) + + let length = if min_len == 0 { 0 } else { max_len }; + + StructChunked::from_series(PlSmallStr::EMPTY, length, v.iter()) .unwrap() .into_series() }, diff --git a/crates/polars-core/src/frame/row/transpose.rs b/crates/polars-core/src/frame/row/transpose.rs index 0f41bb2749d5..6910dfc7800c 100644 --- a/crates/polars-core/src/frame/row/transpose.rs +++ b/crates/polars-core/src/frame/row/transpose.rs @@ -84,7 +84,7 @@ impl DataFrame { })); }, }; - Ok(unsafe { DataFrame::new_no_checks(cols_t) }) + Ok(unsafe { DataFrame::new_no_checks(new_height, cols_t) }) } pub fn transpose( diff --git a/crates/polars-core/src/functions.rs b/crates/polars-core/src/functions.rs index 50ce5d14e491..ebc5006e0493 100644 --- a/crates/polars-core/src/functions.rs +++ b/crates/polars-core/src/functions.rs @@ -38,7 +38,7 @@ pub fn concat_df_diagonal(dfs: &[DataFrame]) -> PolarsResult { None => columns.push(Column::full_null(name.clone(), height, dtype)), } } - unsafe { DataFrame::new_no_checks(columns) } + unsafe { DataFrame::new_no_checks(height, columns) } }) .collect::>(); diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 0ef07e702374..b9a70f44cbd2 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -277,7 +277,13 @@ impl<'de> Deserialize<'de> for Series { #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { let values: Vec = map.next_value()?; - let ca = StructChunked::from_series(name.clone(), values.iter()).unwrap(); + dbg!("TODO: Fix"); + let ca = StructChunked::from_series( + name.clone(), + values[0].len(), + values.iter(), + ) + .unwrap(); let mut s = ca.into_series(); s.rename(name); Ok(s) diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 0ca52862b3b9..8f669132dc49 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -657,20 +657,17 @@ fn any_values_to_list( DataType::Categorical(Some(Arc::new(RevMapping::default())), *ordering) }, - // Structs don't support empty fields yet. - // We must ensure the data-types match what we do physical - #[cfg(feature = "dtype-struct")] - DataType::Struct(fields) if fields.is_empty() => { - DataType::Struct(vec![Field::new(PlSmallStr::EMPTY, DataType::Null)]) - }, - _ => inner_type.clone(), }; + dbg!(&list_inner_type); + let mut builder = get_list_builder(&list_inner_type, capacity * 5, capacity, PlSmallStr::EMPTY)?; for av in avs { + dbg!(&av); + match av { AnyValue::List(b) => match b.cast(inner_type) { Ok(casted) => { @@ -832,7 +829,9 @@ fn any_values_to_struct( ) -> PolarsResult { // Fast path for structs with no fields. if fields.is_empty() { - return Ok(StructChunked::full_null(PlSmallStr::EMPTY, values.len()).into_series()); + return Ok( + StructChunked::from_series(PlSmallStr::EMPTY, values.len(), [].iter())?.into_series(), + ); } // The physical series fields of the struct. 
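The zero-field fast path above is the same case the `@Q?` note earlier worries about: a struct with no fields still needs an explicit length. With the new signature that is directly expressible; a minimal sketch, assuming the `polars_core` prelude:

```
use polars_core::prelude::*;

fn example() -> PolarsResult<()> {
    // Four rows, zero fields: the length can no longer be inferred from a field.
    let zfs = StructChunked::from_series("s".into(), 4, [].iter())?;
    assert_eq!(zfs.len(), 4);
    assert!(zfs.struct_fields().is_empty());
    Ok(())
}
```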
@@ -873,7 +872,8 @@ fn any_values_to_struct( series_fields.push(s) } - let mut out = StructChunked::from_series(PlSmallStr::EMPTY, series_fields.iter())?; + let mut out = + StructChunked::from_series(PlSmallStr::EMPTY, values.len(), series_fields.iter())?; if has_outer_validity { let mut validity = MutableBitmap::new(); validity.extend_constant(values.len(), true); diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index 7f61f99895f4..c53fb9dfc582 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -598,6 +598,7 @@ unsafe fn to_physical_and_dtype( .collect(); let arrow_array = Box::new(StructArray::new( ArrowDataType::Struct(arrow_fields), + arr.len(), values, arr.validity().cloned(), )) as ArrayRef; diff --git a/crates/polars-core/src/series/into.rs b/crates/polars-core/src/series/into.rs index 1213c3346525..914450034cf4 100644 --- a/crates/polars-core/src/series/into.rs +++ b/crates/polars-core/src/series/into.rs @@ -44,7 +44,13 @@ impl Series { s.to_arrow(0, compat_level) }) .collect::>(); - StructArray::new(dt.to_arrow(compat_level), values, arr.validity().cloned()).boxed() + StructArray::new( + dt.to_arrow(compat_level), + ca.len(), + values, + arr.validity().cloned(), + ) + .boxed() }, // special list branch to // make sure that we recursively apply all logical types. diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 72cb3b67dc41..35f5b1258b19 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -257,7 +257,7 @@ impl Series { pub fn into_frame(self) -> DataFrame { // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names - unsafe { DataFrame::new_no_checks(vec![self.into()]) } + unsafe { DataFrame::new_no_checks(self.len(), vec![self.into()]) } } /// Rename series. 
@@ -632,7 +632,8 @@ impl Series { .map(|s| s.to_physical_repr().into_owned()) .collect(); let mut ca = - StructChunked::from_series(self.name().clone(), fields.iter()).unwrap(); + StructChunked::from_series(self.name().clone(), arr.len(), fields.iter()) + .unwrap(); if arr.null_count() > 0 { ca.zip_outer_validity(arr); diff --git a/crates/polars-core/src/series/ops/null.rs b/crates/polars-core/src/series/ops/null.rs index 78e8fb795f27..3a708a404a82 100644 --- a/crates/polars-core/src/series/ops/null.rs +++ b/crates/polars-core/src/series/ops/null.rs @@ -57,7 +57,7 @@ impl Series { .iter() .map(|fld| Series::full_null(fld.name().clone(), size, fld.dtype())) .collect::>(); - let ca = StructChunked::from_series(name, fields.iter()).unwrap(); + let ca = StructChunked::from_series(name, size, fields.iter()).unwrap(); if !fields.is_empty() { ca.with_outer_validity(Some(Bitmap::new_zeroed(size))) diff --git a/crates/polars-core/src/utils/flatten.rs b/crates/polars-core/src/utils/flatten.rs index b96ce61dab82..733ffcd60c08 100644 --- a/crates/polars-core/src/utils/flatten.rs +++ b/crates/polars-core/src/utils/flatten.rs @@ -17,8 +17,10 @@ pub fn flatten_df_iter(df: &DataFrame) -> impl Iterator + '_ { out.set_sorted_flag(s.is_sorted_flag()); Column::from(out) }) - .collect(); - let df = unsafe { DataFrame::new_no_checks(columns) }; + .collect::>(); + + let height = DataFrame::infer_height(&columns); + let df = unsafe { DataFrame::new_no_checks(height, columns) }; if df.is_empty() { None } else { diff --git a/crates/polars-expr/src/expressions/aggregation.rs b/crates/polars-expr/src/expressions/aggregation.rs index e1d2a1e716ab..af5383e83c83 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -535,11 +535,13 @@ impl PartitionedAggregation for AggregationExpr { }; let mut count_s = series.agg_valid_count(groups); count_s.rename(PlSmallStr::from_static("__POLARS_COUNT")); - Ok( - StructChunked::from_series(new_name, [agg_s, count_s].iter()) - .unwrap() - .into_series(), + Ok(StructChunked::from_series( + new_name, + agg_s.len(), + [agg_s, count_s].iter(), ) + .unwrap() + .into_series()) } }, GroupByMethod::Implode => { diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index b47d1744f662..46b20c9a34b3 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -588,8 +588,8 @@ impl PhysicalExpr for WindowExpr { .1, ) } else { - let df_right = unsafe { DataFrame::new_no_checks(keys) }; - let df_left = unsafe { DataFrame::new_no_checks(group_by_columns) }; + let df_right = unsafe { DataFrame::new_no_checks_height_from_first(keys) }; + let df_left = unsafe { DataFrame::new_no_checks_height_from_first(group_by_columns) }; Ok(private_left_join_multiple_keys(&df_left, &df_right, true)?.1) } }; diff --git a/crates/polars-expr/src/state/node_timer.rs b/crates/polars-expr/src/state/node_timer.rs index 48aa65e12c17..c3114d3029cd 100644 --- a/crates/polars-expr/src/state/node_timer.rs +++ b/crates/polars-expr/src/state/node_timer.rs @@ -57,8 +57,9 @@ impl NodeTimer { let mut end = end.into_inner(); end.rename(PlSmallStr::from_static("end")); + let height = nodes_s.len(); let columns = vec![nodes_s, start.into_column(), end.into_column()]; - let df = unsafe { DataFrame::new_no_checks(columns) }; + let df = unsafe { DataFrame::new_no_checks(height, columns) }; df.sort(vec!["start"], SortMultipleOptions::default()) } } diff --git 
a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index 670e46330161..a8c304b8f65c 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -82,7 +82,7 @@ pub(crate) fn cast_columns( }) .collect::>>() })?; - *df = unsafe { DataFrame::new_no_checks(cols) } + *df = unsafe { DataFrame::new_no_checks(df.height(), cols) } } else { // cast to the original dtypes in the schema for fld in to_cast { @@ -636,6 +636,6 @@ fn read_chunk( let columns = buffers .into_iter() .map(|buf| buf.into_series().map(Column::from)) - .collect::>()?; - Ok(unsafe { DataFrame::new_no_checks(columns) }) + .collect::>>()?; + Ok(unsafe { DataFrame::new_no_checks_height_from_first(columns) }) } diff --git a/crates/polars-io/src/csv/read/reader.rs b/crates/polars-io/src/csv/read/reader.rs index e31948c85d26..098f27d82c0d 100644 --- a/crates/polars-io/src/csv/read/reader.rs +++ b/crates/polars-io/src/csv/read/reader.rs @@ -318,30 +318,29 @@ where } #[cfg(feature = "temporal")] -fn parse_dates(mut df: DataFrame, fixed_schema: &Schema) -> DataFrame { +fn parse_dates(df: DataFrame, fixed_schema: &Schema) -> DataFrame { use polars_core::POOL; - let cols = unsafe { std::mem::take(df.get_columns_mut()) } - .into_par_iter() - .map(|c| { - match c.dtype() { - DataType::String => { - let ca = c.str().unwrap(); - // don't change columns that are in the fixed schema. - if fixed_schema.index_of(c.name()).is_some() { - return c; - } - - #[cfg(feature = "dtype-time")] - if let Ok(ca) = ca.as_time(None, false) { - return ca.into_column(); - } - c - }, - _ => c, - } - }); + let height = df.height(); + let cols = df.take_columns().into_par_iter().map(|c| { + match c.dtype() { + DataType::String => { + let ca = c.str().unwrap(); + // don't change columns that are in the fixed schema. + if fixed_schema.index_of(c.name()).is_some() { + return c; + } + + #[cfg(feature = "dtype-time")] + if let Ok(ca) = ca.as_time(None, false) { + return ca.into_column(); + } + c + }, + _ => c, + } + }); let cols = POOL.install(|| cols.collect::>()); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(height, cols) } } diff --git a/crates/polars-io/src/hive.rs b/crates/polars-io/src/hive.rs index 77e65647fa56..df755eab56f3 100644 --- a/crates/polars-io/src/hive.rs +++ b/crates/polars-io/src/hive.rs @@ -1,4 +1,6 @@ +use polars_core::frame::column::ScalarColumn; use polars_core::frame::DataFrame; +use polars_core::prelude::Column; use polars_core::series::Series; /// Materializes hive partitions. @@ -22,13 +24,17 @@ pub(crate) fn materialize_hive_partitions( return; } - let hive_columns_iter = hive_columns + let hive_columns_sc = hive_columns .iter() - .map(|s| s.new_from_index(0, num_rows).into()); + .map(|s| ScalarColumn::new(s.name().clone(), s.first(), num_rows).into()) + .collect::>(); if reader_schema.index_of(hive_columns[0].name()).is_none() || df.width() == 0 { // Fast-path - all hive columns are at the end - unsafe { df.get_columns_mut() }.extend(hive_columns_iter); + if df.width() == 0 { + unsafe { df.set_height(num_rows) }; + } + unsafe { df.hstack_mut_unchecked(&hive_columns_sc) }; return; } @@ -39,9 +45,8 @@ pub(crate) fn materialize_hive_partitions( // We have a slightly involved algorithm here because `reader_schema` may contain extra // columns that were excluded from a projection pushdown. 
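 //
 // Note that `hive_columns_sc` built above is a Vec of `ScalarColumn`s: a sketch of
 // the representation is
 //     ScalarColumn::new(s.name().clone(), s.first(), num_rows)
 // which stores a single value plus a length instead of eagerly repeating the value
 // `num_rows` times as `s.new_from_index(0, num_rows)` did.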
- let hive_columns = hive_columns_iter.collect::>(); // Safety: These are both non-empty at the start - let mut series_arr = [df_columns, hive_columns.as_slice()]; + let mut series_arr = [df_columns, hive_columns_sc.as_slice()]; let mut schema_idx_arr = [ reader_schema.index_of(series_arr[0][0].name()).unwrap(), reader_schema.index_of(series_arr[1][0].name()).unwrap(), @@ -71,6 +76,6 @@ pub(crate) fn materialize_hive_partitions( out_columns.extend_from_slice(series_arr[0]); out_columns.extend_from_slice(series_arr[1]); - *unsafe { df.get_columns_mut() } = out_columns; + *df = unsafe { DataFrame::new_no_checks(num_rows, out_columns) }; } } diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 45aa2260de30..0389a73b5081 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -345,7 +345,7 @@ fn rg_to_dfs_prefiltered( // Apply the predicate to the live columns and save the dataframe and the bitmask let md = &file_metadata.row_groups[rg_idx]; - let mut df = unsafe { DataFrame::new_no_checks(live_columns) }; + let mut df = unsafe { DataFrame::new_no_checks(md.num_rows(), live_columns) }; materialize_hive_partitions( &mut df, @@ -462,6 +462,7 @@ fn rg_to_dfs_prefiltered( debug_assert!(dead_columns.iter().all(|v| v.len() == df.height())); + let height = df.height(); let mut live_columns = df.take_columns(); assert_eq!( @@ -507,7 +508,7 @@ fn rg_to_dfs_prefiltered( // SAFETY: This is completely based on the schema so all column names are unique // and the length is given by the parquet file which should always be the same. - let df = unsafe { DataFrame::new_no_checks(columns) }; + let df = unsafe { DataFrame::new_no_checks(height, columns) }; PolarsResult::Ok(Some(df)) }) @@ -624,7 +625,7 @@ fn rg_to_dfs_optionally_par_over_columns( .collect::>>()? 
}; - let mut df = unsafe { DataFrame::new_no_checks(columns) }; + let mut df = unsafe { DataFrame::new_no_checks(rg_slice.1, columns) }; if let Some(rc) = &row_index { df.with_row_index_mut(rc.name.clone(), Some(*previous_row_count + rc.offset)); } @@ -730,7 +731,7 @@ fn rg_to_dfs_par_over_rg( }) .collect::>>()?; - let mut df = unsafe { DataFrame::new_no_checks(columns) }; + let mut df = unsafe { DataFrame::new_no_checks(slice.1, columns) }; if let Some(rc) = &row_index { df.with_row_index_mut( diff --git a/crates/polars-json/src/json/deserialize.rs b/crates/polars-json/src/json/deserialize.rs index eb4c12954a8d..600bd6dbea53 100644 --- a/crates/polars-json/src/json/deserialize.rs +++ b/crates/polars-json/src/json/deserialize.rs @@ -165,7 +165,7 @@ fn deserialize_struct<'a, A: Borrow>>( }) .collect::>(); - StructArray::new(dtype.clone(), values, validity.into()) + StructArray::new(dtype.clone(), rows.len(), values, validity.into()) } fn fill_array_from( diff --git a/crates/polars-lazy/src/dsl/eval.rs b/crates/polars-lazy/src/dsl/eval.rs index 92783aa874ee..460889ef01a6 100644 --- a/crates/polars-lazy/src/dsl/eval.rs +++ b/crates/polars-lazy/src/dsl/eval.rs @@ -99,9 +99,9 @@ pub trait ExprEvalExtension: IntoExpr + Sized { let c = c.slice(0, len); if (len - c.null_count()) >= min_periods { unsafe { - df_container.get_columns_mut().push(c.into_column()); + df_container.with_column_unchecked(c.into_column()); let out = phys_expr.evaluate(&df_container, &state)?.into_column(); - df_container.get_columns_mut().clear(); + df_container.clear_columns(); finish(out) } } else { diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index 4dae2529bc14..d73e4be5d13e 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -86,9 +86,9 @@ fn run_per_sublist( lst.into_iter() .map(|s| { s.and_then(|s| unsafe { - df_container.get_columns_mut().push(s.into_column()); + df_container.with_column_unchecked(s.into_column()); let out = phys_expr.evaluate(&df_container, &state); - df_container.get_columns_mut().clear(); + df_container.clear_columns(); match out { Ok(s) => Some(s), Err(e) => { diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index 83c6ec2e5bda..c8f1726e57fd 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -156,7 +156,7 @@ fn estimate_unique_count(keys: &[Column], mut sample_size: usize) -> PolarsResul .map(|s| s.slice(offset, sample_size)) .map(Column::from) .collect::>(); - let df = unsafe { DataFrame::new_no_checks(keys) }; + let df = unsafe { DataFrame::new_no_checks(sample_size, keys) }; let names = df.get_column_names().into_iter().cloned(); let gb = df.group_by(names).unwrap(); Ok(finish(gb.get_groups())) diff --git a/crates/polars-mem-engine/src/executors/projection_utils.rs b/crates/polars-mem-engine/src/executors/projection_utils.rs index 477cdd79a162..47464849582e 100644 --- a/crates/polars-mem-engine/src/executors/projection_utils.rs +++ b/crates/polars-mem-engine/src/executors/projection_utils.rs @@ -340,9 +340,12 @@ pub(super) fn check_expand_literals( } // @scalar-opt - let selected_columns = selected_columns.into_iter().map(Column::from).collect(); + let selected_columns = selected_columns + .into_iter() + .map(Column::from) + .collect::>(); - let df = unsafe { DataFrame::new_no_checks(selected_columns) }; + let df = unsafe { 
DataFrame::new_no_checks_height_from_first(selected_columns) }; // a literal could be projected to a zero length dataframe. // This prevents a panic. diff --git a/crates/polars-mem-engine/src/executors/scan/python_scan.rs b/crates/polars-mem-engine/src/executors/scan/python_scan.rs index 067895ed593f..f74da737d0ac 100644 --- a/crates/polars-mem-engine/src/executors/scan/python_scan.rs +++ b/crates/polars-mem-engine/src/executors/scan/python_scan.rs @@ -23,11 +23,9 @@ fn python_df_to_rust(py: Python, df: Bound) -> PolarsResult { let (ptr, len, cap) = raw_parts; unsafe { - Ok(DataFrame::new_no_checks(Vec::from_raw_parts( - ptr as *mut Column, - len, - cap, - ))) + Ok(DataFrame::new_no_checks_height_from_first( + Vec::from_raw_parts(ptr as *mut Column, len, cap), + )) } } diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index a5bb2f78ad89..e325e2982a00 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -64,7 +64,7 @@ impl StackExec { // new, unique column names. It is immediately // followed by a projection which pulls out the // possibly mismatching column lengths. - unsafe { df.get_columns_mut() }.extend(res.into_iter().map(Column::from)); + unsafe { df.column_extend_unchecked(res.into_iter().map(Column::from)) }; } else { let (df_height, df_width) = df.shape(); diff --git a/crates/polars-ops/src/chunked_array/array/to_struct.rs b/crates/polars-ops/src/chunked_array/array/to_struct.rs index b00dbbf4d43b..d07350816711 100644 --- a/crates/polars-ops/src/chunked_array/array/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/array/to_struct.rs @@ -40,7 +40,7 @@ pub trait ToStruct: AsArray { .collect::>>() })?; - StructChunked::from_series(ca.name().clone(), fields.iter()) + StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter()) } } diff --git a/crates/polars-ops/src/chunked_array/gather/chunked.rs b/crates/polars-ops/src/chunked_array/gather/chunked.rs index 345f3689984c..b31c77e8e365 100644 --- a/crates/polars-ops/src/chunked_array/gather/chunked.rs +++ b/crates/polars-ops/src/chunked_array/gather/chunked.rs @@ -24,7 +24,7 @@ pub trait DfTake: IntoDf { .to_df() ._apply_columns(&|s| s.take_chunked_unchecked(idx, sorted)); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks_height_from_first(cols) } } /// Take elements by a slice of optional [`ChunkId`]s. 
/// @@ -35,7 +35,7 @@ pub trait DfTake: IntoDf { .to_df() ._apply_columns(&|s| s.take_opt_chunked_unchecked(idx)); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks_height_from_first(cols) } } /// # Safety @@ -45,7 +45,7 @@ pub trait DfTake: IntoDf { .to_df() ._apply_columns_par(&|s| s.take_chunked_unchecked(idx, sorted)); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks_height_from_first(cols) } } /// # Safety @@ -57,7 +57,7 @@ pub trait DfTake: IntoDf { .to_df() ._apply_columns_par(&|s| s.take_opt_chunked_unchecked(idx)); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks_height_from_first(cols) } } } diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index ca906d12851c..dbf8337ae637 100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -136,7 +136,7 @@ where let out = fields.pop().unwrap(); out.with_name(ca.name().clone()) } else { - StructChunked::from_series(ca.name().clone(), fields.iter()) + StructChunked::from_series(ca.name().clone(), fields[0].len(), fields.iter()) .unwrap() .into_series() } diff --git a/crates/polars-ops/src/chunked_array/list/to_struct.rs b/crates/polars-ops/src/chunked_array/list/to_struct.rs index cdd245bce8b7..a3bd6390b1a3 100644 --- a/crates/polars-ops/src/chunked_array/list/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/list/to_struct.rs @@ -80,7 +80,7 @@ pub trait ToStruct: AsList { .collect::>>() })?; - StructChunked::from_series(ca.name().clone(), fields.iter()) + StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter()) } } diff --git a/crates/polars-ops/src/chunked_array/strings/extract.rs b/crates/polars-ops/src/chunked_array/strings/extract.rs index cb26d66f7aff..e7068e96b614 100644 --- a/crates/polars-ops/src/chunked_array/strings/extract.rs +++ b/crates/polars-ops/src/chunked_array/strings/extract.rs @@ -36,7 +36,7 @@ fn extract_groups_array( } let values = builders.into_iter().map(|a| a.freeze().boxed()).collect(); - Ok(StructArray::new(dtype.clone(), values, arr.validity().cloned()).boxed()) + Ok(StructArray::new(dtype.clone(), arr.len(), values, arr.validity().cloned()).boxed()) } #[cfg(feature = "extract_groups")] @@ -50,6 +50,7 @@ pub(super) fn extract_groups( if n_fields == 1 { return StructChunked::from_series( ca.name().clone(), + ca.len(), [Series::new_null(ca.name().clone(), ca.len())].iter(), ) .map(|ca| ca.into_series()); diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index fe8783530d6a..02b3c076efd7 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -204,6 +204,7 @@ mod tests { let expected_series = StructChunked::from_series( "".into(), + 4, [ Series::new("a".into(), &[None, Some(1), Some(2), None]), Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]), diff --git a/crates/polars-ops/src/chunked_array/strings/split.rs b/crates/polars-ops/src/chunked_array/strings/split.rs index 31c15a70cb08..98faaad75ed0 100644 --- a/crates/polars-ops/src/chunked_array/strings/split.rs +++ b/crates/polars-ops/src/chunked_array/strings/split.rs @@ -141,6 +141,7 @@ where }) } + let len = arrs[0].len(); let fields = arrs .into_iter() .enumerate() @@ -149,7 +150,8 @@ where }) .collect::>(); - StructChunked::from_series(ca.name().clone(), fields.iter()) + 
dbg!("TODO: Properly implement"); + StructChunked::from_series(ca.name().clone(), len, fields.iter()) } pub fn split_helper<'a, F, I>(ca: &'a StringChunked, by: &'a StringChunked, op: F) -> ListChunked diff --git a/crates/polars-ops/src/frame/join/asof/groups.rs b/crates/polars-ops/src/frame/join/asof/groups.rs index 3f2ef6a63ca8..e4f2e7e8fa62 100644 --- a/crates/polars-ops/src/frame/join/asof/groups.rs +++ b/crates/polars-ops/src/frame/join/asof/groups.rs @@ -516,7 +516,7 @@ pub trait AsofJoinBy: IntoDf { .filter(|s| !drop_these.contains(&s.name())) .cloned() .collect(); - let proj_other_df = unsafe { DataFrame::new_no_checks(cols) }; + let proj_other_df = unsafe { DataFrame::new_no_checks(other_df.height(), cols) }; let left = self_df.clone(); diff --git a/crates/polars-ops/src/frame/join/merge_sorted.rs b/crates/polars-ops/src/frame/join/merge_sorted.rs index a180b293ca0f..d018c0e7f756 100644 --- a/crates/polars-ops/src/frame/join/merge_sorted.rs +++ b/crates/polars-ops/src/frame/join/merge_sorted.rs @@ -47,7 +47,7 @@ pub fn _merge_sorted_dfs( }) .collect::>()?; - Ok(unsafe { DataFrame::new_no_checks(new_columns) }) + Ok(unsafe { DataFrame::new_no_checks(left.height() + right.height(), new_columns) }) } fn merge_series(lhs: &Series, rhs: &Series, merge_indicator: &[bool]) -> PolarsResult { @@ -85,7 +85,7 @@ fn merge_series(lhs: &Series, rhs: &Series, merge_indicator: &[bool]) -> PolarsR .zip(rhs.fields_as_series()) .map(|(lhs, rhs)| merge_series(lhs, &rhs, merge_indicator)) .collect::>>()?; - StructChunked::from_series(PlSmallStr::EMPTY, new_fields.iter()) + StructChunked::from_series(PlSmallStr::EMPTY, new_fields[0].len(), new_fields.iter()) .unwrap() .into_series() }, diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 5e9ece7a3878..0c5478e7b8b6 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -233,7 +233,7 @@ fn pivot_impl( already exists in the DataFrame. Please rename it prior to calling `pivot`.") } // @scalar-opt - let columns_struct = StructChunked::from_columns(column.clone(), fields) + let columns_struct = StructChunked::from_columns(column.clone(), fields[0].len(), fields) .unwrap() .into_column(); let mut binding = pivot_df.clone(); diff --git a/crates/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs index 0e0de1083c5b..f91d537fb9d4 100644 --- a/crates/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -485,9 +485,12 @@ pub(super) fn compute_row_idx( } else { let binding = pivot_df.select(index.iter().cloned())?; let fields = binding.get_columns(); - let index_struct_series = - StructChunked::from_columns(PlSmallStr::from_static("placeholder"), fields)? - .into_series(); + let index_struct_series = StructChunked::from_columns( + PlSmallStr::from_static("placeholder"), + fields[0].len(), + fields, + )? 
+ .into_series(); let index_agg = unsafe { index_struct_series.agg_first(groups) }; let index_agg_physical = index_agg.to_physical_repr(); let ca = index_agg_physical.struct_()?; diff --git a/crates/polars-ops/src/frame/pivot/unpivot.rs b/crates/polars-ops/src/frame/pivot/unpivot.rs index 89c38e88c37b..7715659b3c61 100644 --- a/crates/polars-ops/src/frame/pivot/unpivot.rs +++ b/crates/polars-ops/src/frame/pivot/unpivot.rs @@ -115,7 +115,7 @@ pub trait UnpivotDF: IntoDf { out.push(variable_col); out.push(value_col); - return Ok(unsafe { DataFrame::new_no_checks(out) }); + return Ok(unsafe { DataFrame::new_no_checks(0, out) }); } let index_set = PlHashSet::from_iter(index.iter().cloned()); diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index 52cc2ee5a67a..c78aa67efca8 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -63,7 +63,7 @@ fn map_cats( ._with_fast_unique(label_has_value.iter().all(bool::clone)) .into_series(), ]; - Ok(StructChunked::from_series(out_name, outvals.iter())?.into_series()) + Ok(StructChunked::from_series(out_name, outvals[0].len(), outvals.iter())?.into_series()) } else { Ok(bld .drain_iter_and_finish(s_iter.map(|opt| { diff --git a/crates/polars-ops/src/series/ops/horizontal.rs b/crates/polars-ops/src/series/ops/horizontal.rs index 6e07e212bedd..663ac3664c8e 100644 --- a/crates/polars-ops/src/series/ops/horizontal.rs +++ b/crates/polars-ops/src/series/ops/horizontal.rs @@ -2,28 +2,32 @@ use polars_core::frame::NullStrategy; use polars_core::prelude::*; pub fn max_horizontal(s: &[Column]) -> PolarsResult> { - let df = unsafe { DataFrame::_new_no_checks_impl(Vec::from(s)) }; + let df = + unsafe { DataFrame::_new_no_checks_impl(s.first().map_or(0, Column::len), Vec::from(s)) }; df.max_horizontal() .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn min_horizontal(s: &[Column]) -> PolarsResult> { - let df = unsafe { DataFrame::_new_no_checks_impl(Vec::from(s)) }; + let df = + unsafe { DataFrame::_new_no_checks_impl(s.first().map_or(0, Column::len), Vec::from(s)) }; df.min_horizontal() .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn sum_horizontal(s: &[Column]) -> PolarsResult> { - let df = unsafe { DataFrame::_new_no_checks_impl(Vec::from(s)) }; + let df = + unsafe { DataFrame::_new_no_checks_impl(s.first().map_or(0, Column::len), Vec::from(s)) }; df.sum_horizontal(NullStrategy::Ignore) .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } pub fn mean_horizontal(s: &[Column]) -> PolarsResult> { - let df = unsafe { DataFrame::_new_no_checks_impl(Vec::from(s)) }; + let df = + unsafe { DataFrame::_new_no_checks_impl(s.first().map_or(0, Column::len), Vec::from(s)) }; df.mean_horizontal(NullStrategy::Ignore) .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) diff --git a/crates/polars-ops/src/series/ops/replace.rs b/crates/polars-ops/src/series/ops/replace.rs index f2d8f8128777..7c5697429372 100644 --- a/crates/polars-ops/src/series/ops/replace.rs +++ b/crates/polars-ops/src/series/ops/replace.rs @@ -237,6 +237,7 @@ fn create_replacer(mut old: Series, mut new: Series, add_mask: bool) -> PolarsRe old.rename(PlSmallStr::from_static("__POLARS_REPLACE_OLD")); new.rename(PlSmallStr::from_static("__POLARS_REPLACE_NEW")); + let len = old.len(); let cols = if add_mask { // @scalar-opt let 
mask = Column::new(PlSmallStr::from_static("__POLARS_REPLACE_MASK"), &[true]) @@ -245,7 +246,7 @@ fn create_replacer(mut old: Series, mut new: Series, add_mask: bool) -> PolarsRe } else { vec![old.into(), new.into()] }; - let out = unsafe { DataFrame::new_no_checks(cols) }; + let out = unsafe { DataFrame::new_no_checks(len, cols) }; Ok(out) } diff --git a/crates/polars-ops/src/series/ops/rle.rs b/crates/polars-ops/src/series/ops/rle.rs index 9277913558a2..ed88648fee72 100644 --- a/crates/polars-ops/src/series/ops/rle.rs +++ b/crates/polars-ops/src/series/ops/rle.rs @@ -31,7 +31,7 @@ pub fn rle(s: &Column) -> PolarsResult { Series::from_vec(PlSmallStr::from_static("len"), lengths).into(), vals.to_owned(), ]; - Ok(StructChunked::from_columns(s.name().clone(), &outvals)?.into_column()) + Ok(StructChunked::from_columns(s.name().clone(), vals.len(), &outvals)?.into_column()) } /// Similar to `rle`, but maps values to run IDs. diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index 437f49dad480..ab5015f97864 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -44,9 +44,13 @@ impl ToDummies for Series { }; ca.into_column() }) - .collect(); + .collect::>(); - Ok(unsafe { DataFrame::new_no_checks(sort_columns(columns)) }) + Ok(unsafe { + DataFrame::new_no_checks_height_from_first( + sort_columns(columns), + ) + }) } } diff --git a/crates/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs index c29fcc431c98..e49e24ddf7a8 100644 --- a/crates/polars-ops/src/series/ops/various.rs +++ b/crates/polars-ops/src/series/ops/various.rs @@ -39,8 +39,9 @@ pub trait SeriesMethods: SeriesSealed { counts.into_column() }; + let height = counts.len(); let cols = vec![values, counts]; - let df = unsafe { DataFrame::new_no_checks(cols) }; + let df = unsafe { DataFrame::new_no_checks(height, cols) }; if sort { df.sort( [name], diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs index b5b083f8b882..415517207a87 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -403,7 +403,7 @@ pub fn columns_to_iter_recursive( let (mut nested, last_array) = field_to_nested_array(init.clone(), &mut columns, &mut types, last_field)?; debug_assert!(matches!(nested.last().unwrap(), NestedContent::Struct)); - let (_, _, struct_validity) = nested.pop().unwrap(); + let (length, _, struct_validity) = nested.pop().unwrap(); let mut field_arrays = Vec::>::with_capacity(fields.len()); field_arrays.push(last_array); @@ -431,6 +431,7 @@ pub fn columns_to_iter_recursive( nested, Box::new(StructArray::new( ArrowDataType::Struct(fields.clone()), + length, field_arrays, struct_validity, )), diff --git a/crates/polars-parquet/src/arrow/read/statistics/mod.rs b/crates/polars-parquet/src/arrow/read/statistics/mod.rs index cb65827de9a6..d9210abcb5aa 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/mod.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/mod.rs @@ -319,7 +319,11 @@ fn push( null_count, ); }, - Struct(_) => { + Struct(fields) => { + if fields.is_empty() { + return Ok(()); + } + let min = min .as_mut_any() .downcast_mut::() @@ -338,11 +342,11 @@ fn push( .unwrap(); return min - .inner + .inner_mut() .iter_mut() - .zip(max.inner.iter_mut()) - .zip(distinct_count.inner.iter_mut()) - 
.zip(null_count.inner.iter_mut()) + .zip(max.inner_mut()) + .zip(distinct_count.inner_mut()) + .zip(null_count.inner_mut()) .try_for_each(|(((min, max), distinct_count), null_count)| { push( stats, diff --git a/crates/polars-parquet/src/arrow/read/statistics/struct_.rs b/crates/polars-parquet/src/arrow/read/statistics/struct_.rs index bb6889d569f8..8adfaed734c4 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/struct_.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/struct_.rs @@ -7,7 +7,7 @@ use super::make_mutable; #[derive(Debug)] pub struct DynMutableStructArray { dtype: ArrowDataType, - pub inner: Vec>, + inner: Vec>, } impl DynMutableStructArray { @@ -16,6 +16,9 @@ impl DynMutableStructArray { ArrowDataType::Struct(inner) => inner, _ => unreachable!(), }; + + assert!(!inners.is_empty()); + let inner = inners .iter() .map(|f| make_mutable(f.dtype(), capacity)) @@ -23,7 +26,12 @@ impl DynMutableStructArray { Ok(Self { dtype, inner }) } + + pub fn inner_mut(&mut self) -> &mut [Box] { + &mut self.inner + } } + impl MutableArray for DynMutableStructArray { fn dtype(&self) -> &ArrowDataType { &self.dtype @@ -38,9 +46,9 @@ impl MutableArray for DynMutableStructArray { } fn as_box(&mut self) -> Box { + let len = self.len(); let inner = self.inner.iter_mut().map(|x| x.as_box()).collect(); - - Box::new(StructArray::new(self.dtype.clone(), inner, None)) + Box::new(StructArray::new(self.dtype.clone(), len, inner, None)) } fn as_any(&self) -> &dyn std::any::Any { diff --git a/crates/polars-parquet/src/arrow/write/pages.rs b/crates/polars-parquet/src/arrow/write/pages.rs index ce2cb0c1cc29..f943ebfbc671 100644 --- a/crates/polars-parquet/src/arrow/write/pages.rs +++ b/crates/polars-parquet/src/arrow/write/pages.rs @@ -600,6 +600,7 @@ mod tests { let array = StructArray::new( ArrowDataType::Struct(fields), + 4, vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); @@ -664,6 +665,7 @@ mod tests { let array = StructArray::new( ArrowDataType::Struct(fields), + 4, vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); @@ -675,6 +677,7 @@ mod tests { let array = StructArray::new( ArrowDataType::Struct(fields), + 4, vec![Box::new(array.clone()), Box::new(array)], None, ); @@ -767,6 +770,7 @@ mod tests { let array = StructArray::new( ArrowDataType::Struct(fields), + 4, vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); @@ -872,7 +876,7 @@ mod tests { let key_array = Utf8Array::::from_slice(["k1", "k2", "k3", "k4", "k5", "k6"]).boxed(); let val_array = Int32Array::from_slice([42, 28, 19, 31, 21, 17]).boxed(); - let kv_array = StructArray::try_new(kv_type, vec![key_array, val_array], None) + let kv_array = StructArray::try_new(kv_type, 6, vec![key_array, val_array], None) .unwrap() .boxed(); let offsets = OffsetsBuffer::try_from(vec![0, 2, 3, 4, 6]).unwrap(); diff --git a/crates/polars-pipe/src/executors/operators/projection.rs b/crates/polars-pipe/src/executors/operators/projection.rs index 9ae6dbc5299d..de8c9d424b44 100644 --- a/crates/polars-pipe/src/executors/operators/projection.rs +++ b/crates/polars-pipe/src/executors/operators/projection.rs @@ -89,7 +89,8 @@ impl Operator for ProjectionOperator { } } - let chunk = chunk.with_data(unsafe { DataFrame::new_no_checks(projected) }); + let chunk = + chunk.with_data(unsafe { DataFrame::new_no_checks_height_from_first(projected) }); Ok(OperatorResult::Finished(chunk)) } fn split(&self, _thread_no: usize) -> Box { @@ -125,7 +126,7 @@ impl 
Operator for HstackOperator { .collect::>>()?; let columns = chunk.data.get_columns()[..width].to_vec(); - let mut df = unsafe { DataFrame::new_no_checks(columns) }; + let mut df = unsafe { DataFrame::new_no_checks_height_from_first(columns) }; let schema = &*self.input_schema; if self.options.should_broadcast { diff --git a/crates/polars-pipe/src/executors/operators/reproject.rs b/crates/polars-pipe/src/executors/operators/reproject.rs index a4f6010bef79..d037937896d3 100644 --- a/crates/polars-pipe/src/executors/operators/reproject.rs +++ b/crates/polars-pipe/src/executors/operators/reproject.rs @@ -27,7 +27,7 @@ pub(crate) fn reproject_chunk( } else { let columns = chunk.data.get_columns(); let cols = positions.iter().map(|i| columns[*i].clone()).collect(); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(chunk.data.height(), cols) } }; *chunk = chunk.with_data(out); Ok(()) diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 05947baae209..3162c57828b3 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -279,7 +279,7 @@ impl AggHashTable { .map(|buf| buf.into_series().into_column()), ); physical_agg_to_logical(&mut cols, &self.output_schema); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks_height_from_first(cols) } } } diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs index e9fa7ba495cd..98425f66bee0 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs @@ -74,6 +74,8 @@ impl SpillPayload { debug_assert_eq!(self.hashes.len(), self.chunk_idx.len()); debug_assert_eq!(self.hashes.len(), self.keys.len()); + let height = self.hashes.len(); + let hashes = UInt64Chunked::from_vec(PlSmallStr::from_static(HASH_COL), self.hashes).into_column(); let chunk_idx = @@ -87,7 +89,7 @@ impl SpillPayload { cols.push(keys); // @scalar-opt cols.extend(self.aggs.into_iter().map(Column::from)); - unsafe { DataFrame::new_no_checks(cols) } + unsafe { DataFrame::new_no_checks(height, cols) } } fn spilled_to_columns( diff --git a/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs index 6199789336e3..dce130c29187 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs @@ -212,7 +212,7 @@ where .map(|buf| buf.into_series().into_column()), ); physical_agg_to_logical(&mut cols, &self.output_schema); - Some(unsafe { DataFrame::new_no_checks(cols) }) + Some(unsafe { DataFrame::new_no_checks_height_from_first(cols) }) }) .collect::>(); Ok(dfs) diff --git a/crates/polars-pipe/src/executors/sinks/group_by/string.rs b/crates/polars-pipe/src/executors/sinks/group_by/string.rs index 0855e4cbf42d..a6254ba7fdaf 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/string.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/string.rs @@ -216,7 +216,7 @@ impl StringGroupbySink { .map(|buf| buf.into_series().into_column()), ); physical_agg_to_logical(&mut cols, &self.output_schema); - Some(unsafe { DataFrame::new_no_checks(cols) }) + Some(unsafe { 
DataFrame::new_no_checks_height_from_first(cols) }) }) .collect::>(); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/utils.rs b/crates/polars-pipe/src/executors/sinks/group_by/utils.rs index dd0d8ce4aea0..ab9247e7a17b 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/utils.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/utils.rs @@ -59,9 +59,9 @@ pub(super) fn finalize_group_by( let df = if dfs.is_empty() { DataFrame::empty_with_schema(output_schema) } else { - let mut df = accumulate_dataframes_vertical_unchecked(dfs); + let df = accumulate_dataframes_vertical_unchecked(dfs); // re init to check duplicates - unsafe { DataFrame::new(std::mem::take(df.get_columns_mut())) }? + DataFrame::new(df.take_columns())? }; match ooc_payload { diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs index 2ab417ad2096..0b7cbfb2c534 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs @@ -121,6 +121,9 @@ impl GenericFullOuterJoinProbe { left_df .get_columns_mut() .extend_from_slice(right_df.get_columns()); + + // @TODO: Is this actually the case? + // SAFETY: output_names should be unique. left_df .get_columns_mut() .iter_mut() @@ -265,6 +268,7 @@ impl GenericFullOuterJoinProbe { let right_df = unsafe { DataFrame::new_no_checks( + size, right_df .get_columns() .iter() diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index 7d32d10961c3..9cd85697ebaa 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -65,55 +65,56 @@ fn finalize_dataframe( sort_fields: &[EncodingField], schema: &Schema, ) { - unsafe { - let cols = df.get_columns_mut(); - // pop the encoded sort column - let encoded = cols.pop().unwrap(); - - // we decode the row-encoded binary column - // this will be decoded into multiple columns - // these are the columns we sorted by - // those need to be inserted at the `sort_idx` position - // in the `DataFrame`. 
- if can_decode { - let sort_dtypes = sort_dtypes.expect("should be set if 'can_decode'"); - - let encoded = encoded.binary_offset().unwrap(); - assert_eq!(encoded.chunks().len(), 1); - let arr = encoded.downcast_iter().next().unwrap(); - - // SAFETY: - // temporary extend lifetime - // this is safe as the lifetime in rows stays bound to this scope - let arrays = { - let arr = - std::mem::transmute::<&'_ BinaryArray, &'static BinaryArray>(arr); - decode_rows_from_binary(arr, sort_fields, sort_dtypes, rows) - }; - rows.clear(); - - let arrays = sort_by_idx(&arrays, sort_idx); - let mut sort_idx = sort_idx.to_vec(); - sort_idx.sort_unstable(); - - for (sort_idx, arr) in sort_idx.into_iter().zip(arrays) { - let (name, logical_dtype) = schema.get_at_index(sort_idx).unwrap(); - assert_eq!(logical_dtype.to_physical(), DataType::from(arr.dtype())); - let col = - Series::from_chunks_and_dtype_unchecked(name.clone(), vec![arr], logical_dtype) - .into_column(); - cols.insert(sort_idx, col); + // pop the encoded sort column + // SAFETY: We only pop a value + let encoded = unsafe { df.get_columns_mut() }.pop().unwrap(); + + // we decode the row-encoded binary column + // this will be decoded into multiple columns + // these are the columns we sorted by + // those need to be inserted at the `sort_idx` position + // in the `DataFrame`. + if can_decode { + let sort_dtypes = sort_dtypes.expect("should be set if 'can_decode'"); + + let encoded = encoded.binary_offset().unwrap(); + assert_eq!(encoded.chunks().len(), 1); + let arr = encoded.downcast_iter().next().unwrap(); + + // SAFETY: + // temporary extend lifetime + // this is safe as the lifetime in rows stays bound to this scope + let arrays = unsafe { + let arr = std::mem::transmute::<&'_ BinaryArray, &'static BinaryArray>(arr); + decode_rows_from_binary(arr, sort_fields, sort_dtypes, rows) + }; + rows.clear(); + + let arrays = sort_by_idx(&arrays, sort_idx); + let mut sort_idx = sort_idx.to_vec(); + sort_idx.sort_unstable(); + + for (sort_idx, arr) in sort_idx.into_iter().zip(arrays) { + let (name, logical_dtype) = schema.get_at_index(sort_idx).unwrap(); + assert_eq!(logical_dtype.to_physical(), DataType::from(arr.dtype())); + let col = unsafe { + Series::from_chunks_and_dtype_unchecked(name.clone(), vec![arr], logical_dtype) } - } + .into_column(); - let first_sort_col = &mut cols[sort_idx[0]]; - let flag = if sort_options.descending[0] { - IsSorted::Descending - } else { - IsSorted::Ascending - }; - first_sort_col.set_sorted_flag(flag) + // SAFETY: col has the same length as the df height because it was popped from df. + unsafe { df.get_columns_mut() }.insert(sort_idx, col); + } } + + // SAFETY: We just change the sorted flag. 
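+    // Changing the flag cannot violate the frame's invariants; it is metadata that,
+    // e.g., lets a later `df.sort(...)` or a sorted join skip work when the column
+    // is already `IsSorted::Ascending`.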
+ let first_sort_col = &mut unsafe { df.get_columns_mut() }[sort_idx[0]]; + let flag = if sort_options.descending[0] { + IsSorted::Descending + } else { + IsSorted::Ascending + }; + first_sort_col.set_sorted_flag(flag) } /// This struct will dispatch all sorting to `SortSink` @@ -200,12 +201,11 @@ impl SortSinkMultiple { fn encode(&mut self, chunk: &mut DataChunk) -> PolarsResult<()> { let df = &mut chunk.data; - let cols = unsafe { df.get_columns_mut() }; self.sort_column.clear(); for i in self.sort_idx.iter() { - let s = &cols[*i]; + let s = &df.get_columns()[*i]; let arr = _get_rows_encoded_compat_array(s.as_materialized_series())?; self.sort_column.push(arr); } @@ -216,6 +216,9 @@ impl SortSinkMultiple { let mut sorted_sort_idx = self.sort_idx.to_vec(); sorted_sort_idx.sort_unstable(); + // SAFETY: We do not adjust the names or lengths or columns. + let cols = unsafe { df.get_columns_mut() }; + sorted_sort_idx .into_iter() .enumerate() diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 323776deb976..cdb80970e253 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -218,6 +218,10 @@ impl Source for CsvSource { for data_chunk in &mut out { // The batched reader creates the column containing all nulls because the schema it // gets passed contains the column. + // + // SAFETY: Columns are only replaced with columns + // 1. of the same name, and + // 2. of the same length. for s in unsafe { data_chunk.data.get_columns_mut() } { if s.name() == ca.name() { *s = ca.slice(0, s.len()).into_column(); diff --git a/crates/polars-plan/src/dsl/function_expr/coerce.rs b/crates/polars-plan/src/dsl/function_expr/coerce.rs index bd03ede32c84..2ade8737d077 100644 --- a/crates/polars-plan/src/dsl/function_expr/coerce.rs +++ b/crates/polars-plan/src/dsl/function_expr/coerce.rs @@ -1,5 +1,22 @@ use polars_core::prelude::*; -pub fn as_struct(s: &[Column]) -> PolarsResult { - Ok(StructChunked::from_columns(s[0].name().clone(), s)?.into_column()) +pub fn as_struct(cols: &[Column]) -> PolarsResult { + let Some(fst) = cols.first() else { + polars_bail!(nyi = "turning no columns as_struct"); + }; + + let mut min_length = usize::MAX; + let mut max_length = usize::MIN; + + for col in cols { + let len = col.len(); + + min_length = min_length.min(len); + max_length = max_length.max(len); + } + + // @NOTE: Any additional errors should be handled by the StructChunked::from_columns + let length = if min_length == 0 { 0 } else { max_length }; + + Ok(StructChunked::from_columns(fst.name().clone(), length, cols)?.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/struct_.rs b/crates/polars-plan/src/dsl/function_expr/struct_.rs index acc8020b8e7e..23c8c961f9b2 100644 --- a/crates/polars-plan/src/dsl/function_expr/struct_.rs +++ b/crates/polars-plan/src/dsl/function_expr/struct_.rs @@ -176,7 +176,7 @@ pub(super) fn rename_fields(s: &Column, names: Arc<[PlSmallStr]>) -> PolarsResul s }) .collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?; + let mut out = StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } @@ -193,7 +193,7 @@ pub(super) fn prefix_fields(s: &Column, prefix: &str) -> PolarsResult { s }) .collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?; + let mut out = StructChunked::from_series(ca.name().clone(), ca.len(), 
fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } @@ -210,7 +210,7 @@ pub(super) fn suffix_fields(s: &Column, suffix: &str) -> PolarsResult { s }) .collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?; + let mut out = StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } @@ -245,7 +245,8 @@ pub(super) fn with_fields(args: &[Column]) -> PolarsResult { } let new_fields = fields.into_values().cloned().collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?; + let mut out = + StructChunked::from_series(ca.name().clone(), new_fields[0].len(), new_fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } diff --git a/crates/polars-plan/src/dsl/functions/horizontal.rs b/crates/polars-plan/src/dsl/functions/horizontal.rs index 542212f8de82..41b9d79be0a4 100644 --- a/crates/polars-plan/src/dsl/functions/horizontal.rs +++ b/crates/polars-plan/src/dsl/functions/horizontal.rs @@ -126,7 +126,7 @@ where result.push(acc.clone()); } - StructChunked::from_columns(acc.name().clone(), &result) + StructChunked::from_columns(acc.name().clone(), result[0].len(), &result) .map(|ca| Some(ca.into_column())) }, None => Err(polars_err!(ComputeError: "`reduce` did not have any expressions to fold")), @@ -176,7 +176,8 @@ where } } - StructChunked::from_columns(acc.name().clone(), &result).map(|ca| Some(ca.into_column())) + StructChunked::from_columns(acc.name().clone(), result[0].len(), &result) + .map(|ca| Some(ca.into_column())) }); Expr::AnonymousFunction { diff --git a/crates/polars-plan/src/dsl/name.rs b/crates/polars-plan/src/dsl/name.rs index 1261b4430bec..5eeb67a77ba1 100644 --- a/crates/polars-plan/src/dsl/name.rs +++ b/crates/polars-plan/src/dsl/name.rs @@ -76,7 +76,7 @@ impl ExprNameNameSpace { fd }) .collect::>(); - let mut out = StructChunked::from_series(s.name().clone(), fields.iter())?; + let mut out = StructChunked::from_series(s.name().clone(), s.len(), fields.iter())?; out.zip_outer_validity(s); Ok(Some(out.into_column())) }, diff --git a/crates/polars-plan/src/plans/functions/merge_sorted.rs b/crates/polars-plan/src/plans/functions/merge_sorted.rs index 605a628c3c88..de170583e2d9 100644 --- a/crates/polars-plan/src/plans/functions/merge_sorted.rs +++ b/crates/polars-plan/src/plans/functions/merge_sorted.rs @@ -31,8 +31,10 @@ pub(super) fn merge_sorted(df: &DataFrame, column: &str) -> PolarsResult (usize, usize, usize) { // Used for polars-lazy python node. This takes the dataframe from // underneath of you, so don't use this anywhere else. 
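 // `take_columns` consumes the frame and moves the column Vec out, e.g.
 //     let cols: Vec<Column> = df.take_columns();
 // so no `unsafe` access to the internal columns is required for this.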
-        let mut df = std::mem::take(&mut self.df);
-        let cols = unsafe { std::mem::take(df.get_columns_mut()) };
+        let df = std::mem::take(&mut self.df);
+        let cols = df.take_columns();
         let mut md_cols = ManuallyDrop::new(cols);
         let ptr = md_cols.as_mut_ptr();
         let len = md_cols.len();
diff --git a/crates/polars-python/src/interop/arrow/to_py.rs b/crates/polars-python/src/interop/arrow/to_py.rs
index 017771bb1567..1a90e9eb680a 100644
--- a/crates/polars-python/src/interop/arrow/to_py.rs
+++ b/crates/polars-python/src/interop/arrow/to_py.rs
@@ -123,10 +123,15 @@ impl Iterator for DataFrameStreamIterator {
             .columns
             .iter()
             .map(|s| s.to_arrow(self.idx, CompatLevel::newest()))
-            .collect();
+            .collect::<Vec<_>>();
         self.idx += 1;
 
-        let array = arrow::array::StructArray::new(self.dtype.clone(), batch_cols, None);
+        let array = arrow::array::StructArray::new(
+            self.dtype.clone(),
+            batch_cols[0].len(),
+            batch_cols,
+            None,
+        );
         Some(Ok(Box::new(array)))
     }
 }
diff --git a/crates/polars-python/src/interop/arrow/to_rust.rs b/crates/polars-python/src/interop/arrow/to_rust.rs
index 809bd527a492..1add88c96fd8 100644
--- a/crates/polars-python/src/interop/arrow/to_rust.rs
+++ b/crates/polars-python/src/interop/arrow/to_rust.rs
@@ -105,7 +105,7 @@ pub fn to_rust_df(rb: &[Bound<PyAny>]) -> PyResult<DataFrame> {
             }?;
 
             // no need to check as a record batch has the same guarantees
-            Ok(unsafe { DataFrame::new_no_checks(columns) })
+            Ok(unsafe { DataFrame::new_no_checks_height_from_first(columns) })
         })
         .collect::<PyResult<Vec<_>>>()?;
diff --git a/crates/polars-python/src/map/mod.rs b/crates/polars-python/src/map/mod.rs
index ef1bb4e34507..e899004e57ee 100644
--- a/crates/polars-python/src/map/mod.rs
+++ b/crates/polars-python/src/map/mod.rs
@@ -122,10 +122,12 @@ fn iterator_to_struct<'a>(
             .collect::<Vec<_>>()
     });
 
-    Ok(StructChunked::from_series(name, fields.iter())
-        .unwrap()
-        .into_series()
-        .into())
+    Ok(
+        StructChunked::from_series(name, fields[0].len(), fields.iter())
+            .unwrap()
+            .into_series()
+            .into(),
+    )
 }
 
 fn iterator_to_primitive(
diff --git a/crates/polars-row/src/decode.rs b/crates/polars-row/src/decode.rs
index 858ce3f55fcf..9dcac4a01711 100644
--- a/crates/polars-row/src/decode.rs
+++ b/crates/polars-row/src/decode.rs
@@ -64,7 +64,8 @@ unsafe fn decode(rows: &mut [&[u8]], field: &EncodingField, dtype: &ArrowDataType
                 .iter()
                 .map(|struct_fld| decode(rows, field, struct_fld.dtype()))
                 .collect();
-            StructArray::new(dtype.clone(), values, None).to_boxed()
+            // TODO: Verify that `rows.len()` is the correct length here.
+            StructArray::new(dtype.clone(), rows.len(), values, None).to_boxed()
         },
         dt => {
             with_match_arrow_primitive_type!(dt, |$T| {
diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs
index dc8fe611eafd..119345295686 100644
--- a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs
+++ b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs
@@ -108,17 +108,22 @@ impl RowGroupDecoder {
             out_columns.push(file_path_series.slice(0, projection_height));
         }
 
-        let df = unsafe { DataFrame::new_no_checks(out_columns) };
+        let df = unsafe { DataFrame::new_no_checks(projection_height, out_columns) };
 
         let df = if let Some(predicate) = self.physical_predicate.as_deref() {
             let mask = predicate.evaluate_io(&df)?;
             let mask = mask.bool().unwrap();
 
-            unsafe {
-                DataFrame::new_no_checks(
-                    filter_cols(df.take_columns(), mask, self.min_values_per_thread).await?,
-                )
-            }
+            let filtered =
+                unsafe { filter_cols(df.take_columns(), mask, self.min_values_per_thread) }.await?;
+
+            let height = if let Some(fst) = filtered.first() {
+                fst.len()
+            } else {
+                mask.num_trues()
+            };
+
+            unsafe { DataFrame::new_no_checks(height, filtered) }
         } else {
             df
         };
@@ -515,7 +520,9 @@ impl RowGroupDecoder {
             live_columns.push(s?);
         }
 
-        let live_df = unsafe { DataFrame::new_no_checks(live_columns) };
+        let live_df = unsafe {
+            DataFrame::new_no_checks(row_group_data.row_group_metadata.num_rows(), live_columns)
+        };
         let mask = self
             .physical_predicate
             .as_deref()
@@ -523,12 +530,18 @@ impl RowGroupDecoder {
             .evaluate_io(&live_df)?;
         let mask = mask.bool().unwrap();
 
-        let live_df_filtered = unsafe {
-            DataFrame::new_no_checks(
-                filter_cols(live_df.take_columns(), mask, self.min_values_per_thread).await?,
-            )
+        let filtered =
+            unsafe { filter_cols(live_df.take_columns(), mask, self.min_values_per_thread) }
+                .await?;
+
+        let height = if let Some(fst) = filtered.first() {
+            fst.len()
+        } else {
+            mask.num_trues()
         };
+
+        let live_df_filtered = unsafe { DataFrame::new_no_checks(height, filtered) };
 
         let mask_bitmap = {
             let mut mask_bitmap = MutableBitmap::with_capacity(mask.len());
@@ -590,7 +603,7 @@ impl RowGroupDecoder {
         out_columns.extend(live_rem); // optional hive cols, file path col
 
         assert_eq!(dead_rem.len(), 0);
 
-        let df = unsafe { DataFrame::new_no_checks(out_columns) };
+        let df = unsafe { DataFrame::new_no_checks(expected_num_rows, out_columns) };
         Ok(self.split_to_morsels(df))
     }
 }
diff --git a/crates/polars/tests/it/arrow/array/growable/mod.rs
b/crates/polars/tests/it/arrow/array/growable/mod.rs index 4510fd0749cd..648e5203263a 100644 --- a/crates/polars/tests/it/arrow/array/growable/mod.rs +++ b/crates/polars/tests/it/arrow/array/growable/mod.rs @@ -60,6 +60,7 @@ fn test_make_growable_extension() { ); let array = StructArray::new( dtype.clone(), + 2, vec![Int32Array::from_slice([1, 2]).boxed()], None, ); diff --git a/crates/polars/tests/it/arrow/array/growable/struct_.rs b/crates/polars/tests/it/arrow/array/growable/struct_.rs index 07f0403ee294..2749fa88bb1c 100644 --- a/crates/polars/tests/it/arrow/array/growable/struct_.rs +++ b/crates/polars/tests/it/arrow/array/growable/struct_.rs @@ -29,7 +29,7 @@ fn some_values() -> (ArrowDataType, Vec>) { fn basic() { let (fields, values) = some_values(); - let array = StructArray::new(fields.clone(), values.clone(), None); + let array = StructArray::new(fields.clone(), values[0].len(), values.clone(), None); let mut a = GrowableStruct::new(vec![&array], false, 0); @@ -41,6 +41,7 @@ fn basic() { let expected = StructArray::new( fields, + 2, vec![values[0].sliced(1, 2), values[1].sliced(1, 2)], None, ); @@ -51,7 +52,8 @@ fn basic() { fn offset() { let (fields, values) = some_values(); - let array = StructArray::new(fields.clone(), values.clone(), None).sliced(1, 3); + let array = + StructArray::new(fields.clone(), values[0].len(), values.clone(), None).sliced(1, 3); let mut a = GrowableStruct::new(vec![&array], false, 0); @@ -63,6 +65,7 @@ fn offset() { let expected = StructArray::new( fields, + 2, vec![values[0].sliced(2, 2), values[1].sliced(2, 2)], None, ); @@ -76,6 +79,7 @@ fn nulls() { let array = StructArray::new( fields.clone(), + values[0].len(), values.clone(), Some(Bitmap::from_u8_slice([0b00000010], 5)), ); @@ -90,6 +94,7 @@ fn nulls() { let expected = StructArray::new( fields, + 2, vec![values[0].sliced(1, 2), values[1].sliced(1, 2)], Some(Bitmap::from_u8_slice([0b00000010], 5).sliced(1, 2)), ); @@ -101,7 +106,7 @@ fn nulls() { fn many() { let (fields, values) = some_values(); - let array = StructArray::new(fields.clone(), values.clone(), None); + let array = StructArray::new(fields.clone(), values[0].len(), values.clone(), None); let mut mutable = GrowableStruct::new(vec![&array, &array], true, 0); @@ -132,6 +137,7 @@ fn many() { let expected = StructArray::new( fields, + expected_string.len(), vec![expected_string, expected_int], Some(Bitmap::from([true, true, true, true, false])), ); diff --git a/crates/polars/tests/it/arrow/array/map/mod.rs b/crates/polars/tests/it/arrow/array/map/mod.rs index 34e880578659..44702f118a89 100644 --- a/crates/polars/tests/it/arrow/array/map/mod.rs +++ b/crates/polars/tests/it/arrow/array/map/mod.rs @@ -13,6 +13,7 @@ fn array() -> MapArray { let field = StructArray::new( dt(), + 3, vec![ Box::new(Utf8Array::::from_slice(["a", "aa", "aaa"])) as _, Box::new(Utf8Array::::from_slice(["b", "bb", "bbb"])), @@ -36,6 +37,7 @@ fn basics() { array.value(0), Box::new(StructArray::new( dt(), + 1, vec![ Box::new(Utf8Array::::from_slice(["a"])) as _, Box::new(Utf8Array::::from_slice(["b"])), @@ -49,6 +51,7 @@ fn basics() { sliced.value(0), Box::new(StructArray::new( dt(), + 1, vec![ Box::new(Utf8Array::::from_slice(["aa"])) as _, Box::new(Utf8Array::::from_slice(["bb"])), @@ -66,6 +69,7 @@ fn split_at() { lhs.value(0), Box::new(StructArray::new( dt(), + 1, vec![ Box::new(Utf8Array::::from_slice(["a"])) as _, Box::new(Utf8Array::::from_slice(["b"])), @@ -77,6 +81,7 @@ fn split_at() { rhs.value(0), Box::new(StructArray::new( dt(), + 1, vec![ 
Box::new(Utf8Array::::from_slice(["aa"])) as _, Box::new(Utf8Array::::from_slice(["bb"])), @@ -88,6 +93,7 @@ fn split_at() { rhs.value(1), Box::new(StructArray::new( dt(), + 1, vec![ Box::new(Utf8Array::::from_slice(["aaa"])) as _, Box::new(Utf8Array::::from_slice(["bbb"])), diff --git a/crates/polars/tests/it/arrow/array/struct_/iterator.rs b/crates/polars/tests/it/arrow/array/struct_/iterator.rs index e4b6a7691ad0..7e3430e355d9 100644 --- a/crates/polars/tests/it/arrow/array/struct_/iterator.rs +++ b/crates/polars/tests/it/arrow/array/struct_/iterator.rs @@ -14,6 +14,7 @@ fn test_simple_iter() { let array = StructArray::new( ArrowDataType::Struct(fields), + boolean.len(), vec![boolean.clone(), int.clone()], None, ); diff --git a/crates/polars/tests/it/arrow/array/struct_/mod.rs b/crates/polars/tests/it/arrow/array/struct_/mod.rs index bd1a1c83086c..9492a1b6bdba 100644 --- a/crates/polars/tests/it/arrow/array/struct_/mod.rs +++ b/crates/polars/tests/it/arrow/array/struct_/mod.rs @@ -1,5 +1,4 @@ mod iterator; -mod mutable; use arrow::array::*; use arrow::bitmap::Bitmap; @@ -16,6 +15,7 @@ fn array() -> StructArray { StructArray::new( ArrowDataType::Struct(fields), + boolean.len(), vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ) diff --git a/crates/polars/tests/it/arrow/array/struct_/mutable.rs b/crates/polars/tests/it/arrow/array/struct_/mutable.rs deleted file mode 100644 index 4a526a76391b..000000000000 --- a/crates/polars/tests/it/arrow/array/struct_/mutable.rs +++ /dev/null @@ -1,31 +0,0 @@ -use arrow::array::*; -use arrow::datatypes::{ArrowDataType, Field}; - -#[test] -fn push() { - let c1 = Box::new(MutablePrimitiveArray::::new()) as Box; - let values = vec![c1]; - let dtype = ArrowDataType::Struct(vec![Field::new("f1".into(), ArrowDataType::Int32, true)]); - let mut a = MutableStructArray::new(dtype, values); - - a.value::>(0) - .unwrap() - .push(Some(1)); - a.push(true); - a.value::>(0).unwrap().push(None); - a.push(false); - a.value::>(0) - .unwrap() - .push(Some(2)); - a.push(true); - - assert_eq!(a.len(), 3); - assert!(a.is_valid(0)); - assert!(!a.is_valid(1)); - assert!(a.is_valid(2)); - - assert_eq!( - a.value::>(0).unwrap().values(), - &Vec::from([1, 0, 2]) - ); -} diff --git a/crates/polars/tests/it/arrow/scalar/map.rs b/crates/polars/tests/it/arrow/scalar/map.rs index ee23cb47960f..3a12e8bffcd6 100644 --- a/crates/polars/tests/it/arrow/scalar/map.rs +++ b/crates/polars/tests/it/arrow/scalar/map.rs @@ -11,6 +11,7 @@ fn equal() { ]); let kv_array1 = StructArray::try_new( kv_dt.clone(), + 2, vec![ Utf8Array::::from([Some("k1"), Some("k2")]).boxed(), BooleanArray::from_slice([true, false]).boxed(), @@ -20,6 +21,7 @@ fn equal() { .unwrap(); let kv_array2 = StructArray::try_new( kv_dt.clone(), + 2, vec![ Utf8Array::::from([Some("k1"), Some("k3")]).boxed(), BooleanArray::from_slice([true, true]).boxed(), @@ -47,6 +49,7 @@ fn basics() { ]); let kv_array = StructArray::try_new( kv_dt.clone(), + 2, vec![ Utf8Array::::from([Some("k1"), Some("k2")]).boxed(), BooleanArray::from_slice([true, false]).boxed(), diff --git a/crates/polars/tests/it/io/avro/read.rs b/crates/polars/tests/it/io/avro/read.rs index dac9adbfc9d0..d03503af2049 100644 --- a/crates/polars/tests/it/io/avro/read.rs +++ b/crates/polars/tests/it/io/avro/read.rs @@ -110,12 +110,14 @@ pub(super) fn data() -> RecordBatchT> { array.into_box(), StructArray::new( ArrowDataType::Struct(vec![Field::new("e".into(), ArrowDataType::Float64, false)]), + 2, vec![PrimitiveArray::::from_slice([1.0, 
2.0]).boxed()], None, ) .boxed(), StructArray::new( ArrowDataType::Struct(vec![Field::new("e".into(), ArrowDataType::Float64, false)]), + 2, vec![PrimitiveArray::::from_slice([1.0, 0.0]).boxed()], Some([true, false].into()), ) diff --git a/crates/polars/tests/it/io/avro/write.rs b/crates/polars/tests/it/io/avro/write.rs index 43011eb7a2bf..8aa1d5d8374b 100644 --- a/crates/polars/tests/it/io/avro/write.rs +++ b/crates/polars/tests/it/io/avro/write.rs @@ -268,6 +268,7 @@ fn struct_data() -> RecordBatchT> { RecordBatchT::new(vec![ Box::new(StructArray::new( struct_dt.clone(), + 2, vec![ Box::new(PrimitiveArray::::from_slice([1, 2])), Box::new(PrimitiveArray::::from([None, Some(1)])), @@ -276,6 +277,7 @@ fn struct_data() -> RecordBatchT> { )), Box::new(StructArray::new( struct_dt, + 2, vec![ Box::new(PrimitiveArray::::from_slice([1, 2])), Box::new(PrimitiveArray::::from([None, Some(1)])), diff --git a/crates/polars/tests/it/io/parquet/arrow/mod.rs b/crates/polars/tests/it/io/parquet/arrow/mod.rs index a54b5fcacb1c..fd01930944fc 100644 --- a/crates/polars/tests/it/io/parquet/arrow/mod.rs +++ b/crates/polars/tests/it/io/parquet/arrow/mod.rs @@ -19,6 +19,7 @@ use super::read::file::FileReader; fn new_struct( arrays: Vec>, + length: usize, names: Vec, validity: Option, ) -> StructArray { @@ -27,7 +28,7 @@ fn new_struct( .zip(arrays.iter()) .map(|(n, a)| Field::new(n.into(), a.dtype().clone(), true)) .collect(); - StructArray::new(ArrowDataType::Struct(fields), arrays, validity) + StructArray::new(ArrowDataType::Struct(fields), length, arrays, validity) } pub fn read_column(mut reader: R, column: &str) -> PolarsResult> { @@ -85,6 +86,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { ); StructArray::new( ArrowDataType::Struct(vec![Field::new("f1".into(), a.dtype().clone(), true)]), + a.len(), vec![a.boxed()], None, ) @@ -260,8 +262,11 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { Some("e"), ]) .boxed(); + + let len = array.len(); new_struct( vec![array], + len, vec!["a".to_string()], Some( [ @@ -322,8 +327,10 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { ) .boxed(); + let len = array.len(); new_struct( vec![array], + len, vec!["a".to_string()], Some( [ @@ -416,7 +423,10 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { let array: ListArray = a.into(); Box::new(array) }, - "struct_list_nullable" => new_struct(vec![values], vec!["a".to_string()], None).boxed(), + "struct_list_nullable" => { + let len = values.len(); + new_struct(vec![values], len, vec!["a".to_string()], None).boxed() + }, _ => { let field = match column { "list_int64" => Field::new("item".into(), ArrowDataType::Int64, true), @@ -809,6 +819,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { distinct_count: new_list( new_struct( vec![UInt64Array::from([None]).boxed()], + 1, vec!["a".to_string()], None, ) @@ -819,6 +830,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { null_count: new_list( new_struct( vec![UInt64Array::from([Some(4)]).boxed()], + 1, vec!["a".to_string()], None, ) @@ -829,6 +841,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { min_value: new_list( new_struct( vec![Utf8ViewArray::from_slice([Some("a")]).boxed()], + 1, vec!["a".to_string()], None, ) @@ -839,6 +852,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { max_value: new_list( new_struct( vec![Utf8ViewArray::from_slice([Some("e")]).boxed()], + 1, vec!["a".to_string()], None, ) @@ -851,6 +865,7 @@ pub fn 
pyarrow_nested_nullable_statistics(column: &str) -> Statistics { distinct_count: new_list( new_struct( vec![new_list(UInt64Array::from([None]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) @@ -861,6 +876,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { null_count: new_list( new_struct( vec![new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) @@ -871,6 +887,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { min_value: new_list( new_struct( vec![new_list(Utf8ViewArray::from_slice([Some("a")]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) @@ -881,6 +898,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { max_value: new_list( new_struct( vec![new_list(Utf8ViewArray::from_slice([Some("d")]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) @@ -892,24 +910,28 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { "struct_list_nullable" => Statistics { distinct_count: new_struct( vec![new_list(UInt64Array::from([None]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) .boxed(), null_count: new_struct( vec![new_list(UInt64Array::from([Some(1)]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) .boxed(), min_value: new_struct( vec![new_list(Utf8ViewArray::from_slice([Some("")]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) .boxed(), max_value: new_struct( vec![new_list(Utf8ViewArray::from_slice([Some("ccc")]).boxed(), true).boxed()], + 1, vec!["a".to_string()], None, ) @@ -933,13 +955,13 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { ) }; - let new_struct = |arrays: Vec>, names: Vec| { + let new_struct = |arrays: Vec>, length: usize, names: Vec| { let fields = names .into_iter() .zip(arrays.iter()) .map(|(n, a)| Field::new(n.into(), a.dtype().clone(), true)) .collect(); - StructArray::new(ArrowDataType::Struct(fields), arrays, None) + StructArray::new(ArrowDataType::Struct(fields), length, arrays, None) }; let names = vec!["f1".to_string()]; @@ -960,20 +982,24 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { "struct_list_nullable" => Statistics { distinct_count: new_struct( vec![new_list(Box::new(UInt64Array::from([None]))).boxed()], + 1, names.clone(), ) .boxed(), null_count: new_struct( vec![new_list(Box::new(UInt64Array::from([Some(1)]))).boxed()], + 1, names.clone(), ) .boxed(), min_value: Box::new(new_struct( vec![new_list(Box::new(Utf8ViewArray::from_slice([Some("a")]))).boxed()], + 1, names.clone(), )), max_value: Box::new(new_struct( vec![new_list(Box::new(Utf8ViewArray::from_slice([Some("c")]))).boxed()], + 1, names, )), }, @@ -981,6 +1007,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { distinct_count: new_list( new_struct( vec![new_list(Box::new(UInt64Array::from([None]))).boxed()], + 1, names.clone(), ) .boxed(), @@ -989,6 +1016,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { null_count: new_list( new_struct( vec![new_list(Box::new(UInt64Array::from([Some(1)]))).boxed()], + 1, names.clone(), ) .boxed(), @@ -996,11 +1024,13 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { .boxed(), min_value: new_list(Box::new(new_struct( vec![new_list(Box::new(Utf8ViewArray::from_slice([Some("a")]))).boxed()], + 1, names.clone(), ))) .boxed(), max_value: new_list(Box::new(new_struct( vec![new_list(Box::new(Utf8ViewArray::from_slice([Some("c")]))).boxed()], + 1, names, 
))) .boxed(), @@ -1045,12 +1075,23 @@ pub fn pyarrow_struct(column: &str) -> Box { Field::new("f2".into(), ArrowDataType::Boolean, true), ]; match column { - "struct" => { - StructArray::new(ArrowDataType::Struct(fields), vec![string, boolean], None).boxed() - }, + "struct" => StructArray::new( + ArrowDataType::Struct(fields), + string.len(), + vec![string, boolean], + None, + ) + .boxed(), "struct_nullable" => { + let len = string.len(); let values = vec![string, boolean]; - StructArray::new(ArrowDataType::Struct(fields), values, Some(mask.into())).boxed() + StructArray::new( + ArrowDataType::Struct(fields), + len, + values, + Some(mask.into()), + ) + .boxed() }, "struct_struct" => { let struct_ = pyarrow_struct("struct"); @@ -1059,6 +1100,7 @@ pub fn pyarrow_struct(column: &str) -> Box { Field::new("f1".into(), ArrowDataType::Struct(fields), true), Field::new("f2".into(), ArrowDataType::Boolean, true), ]), + struct_.len(), vec![struct_, boolean], None, )) @@ -1070,6 +1112,7 @@ pub fn pyarrow_struct(column: &str) -> Box { Field::new("f1".into(), ArrowDataType::Struct(fields), true), Field::new("f2".into(), ArrowDataType::Boolean, true), ]), + struct_.len(), vec![struct_, boolean], Some(mask.into()), )) @@ -1079,8 +1122,9 @@ pub fn pyarrow_struct(column: &str) -> Box { } pub fn pyarrow_struct_statistics(column: &str) -> Statistics { - let new_struct = - |arrays: Vec>, names: Vec| new_struct(arrays, names, None); + let new_struct = |arrays: Vec>, length: usize, names: Vec| { + new_struct(arrays, length, names, None) + }; let names = vec!["f1".to_string(), "f2".to_string()]; @@ -1091,6 +1135,7 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(UInt64Array::from([None])), Box::new(UInt64Array::from([Some(2)])), ], + 1, names.clone(), ) .boxed(), @@ -1099,6 +1144,7 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(UInt64Array::from([Some(4)])), Box::new(UInt64Array::from([Some(4)])), ], + 1, names.clone(), ) .boxed(), @@ -1107,6 +1153,7 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(Utf8ViewArray::from_slice([Some("")])), Box::new(BooleanArray::from_slice([false])), ], + 1, names.clone(), )), max_value: Box::new(new_struct( @@ -1114,6 +1161,7 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(Utf8ViewArray::from_slice([Some("def")])), Box::new(BooleanArray::from_slice([true])), ], + 1, names, )), }, @@ -1125,11 +1173,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(UInt64Array::from([None])), Box::new(UInt64Array::from([Some(2)])), ], + 1, names.clone(), ) .boxed(), UInt64Array::from([None]).boxed(), ], + 1, names.clone(), ) .boxed(), @@ -1140,11 +1190,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(UInt64Array::from([Some(4)])), Box::new(UInt64Array::from([Some(4)])), ], + 1, names.clone(), ) .boxed(), UInt64Array::from([Some(4)]).boxed(), ], + 1, names.clone(), ) .boxed(), @@ -1155,11 +1207,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Utf8ViewArray::from_slice([Some("")]).boxed(), BooleanArray::from_slice([false]).boxed(), ], + 1, names.clone(), ) .boxed(), BooleanArray::from_slice([false]).boxed(), ], + 1, names.clone(), ) .boxed(), @@ -1170,11 +1224,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Utf8ViewArray::from_slice([Some("def")]).boxed(), BooleanArray::from_slice([true]).boxed(), ], + 1, names.clone(), ) .boxed(), BooleanArray::from_slice([true]).boxed(), ], + 1, names, ) .boxed(), 
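Note on the constructor change running through the hunks above: `StructArray::new` now takes an explicit `length` before the child arrays. Once zero-field structs are allowed, the row count can no longer be inferred from a first child, so the caller has to state it. A minimal sketch of the new invariant, assuming the polars-arrow API as changed in this series:

    use polars_arrow::array::StructArray;
    use polars_arrow::datatypes::ArrowDataType;

    // A struct with no fields still has a well-defined number of rows; the
    // explicit length is the only place that information can live.
    let zfs = StructArray::new(ArrowDataType::Struct(vec![]), 5, vec![], None);
    assert_eq!(zfs.len(), 5);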
@@ -1187,11 +1243,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(UInt64Array::from([None])), Box::new(UInt64Array::from([None])), ], + 1, names.clone(), ) .boxed(), UInt64Array::from([None]).boxed(), ], + 1, names.clone(), ) .boxed(), @@ -1202,11 +1260,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Box::new(UInt64Array::from([Some(5)])), Box::new(UInt64Array::from([Some(5)])), ], + 1, names.clone(), ) .boxed(), UInt64Array::from([Some(5)]).boxed(), ], + 1, names.clone(), ) .boxed(), @@ -1217,11 +1277,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Utf8ViewArray::from_slice([Some("")]).boxed(), BooleanArray::from_slice([false]).boxed(), ], + 1, names.clone(), ) .boxed(), BooleanArray::from_slice([false]).boxed(), ], + 1, names.clone(), ) .boxed(), @@ -1232,11 +1294,13 @@ pub fn pyarrow_struct_statistics(column: &str) -> Statistics { Utf8ViewArray::from_slice([Some("def")]).boxed(), BooleanArray::from_slice([true]).boxed(), ], + 1, names.clone(), ) .boxed(), BooleanArray::from_slice([true]).boxed(), ], + 1, names, ) .boxed(), diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 5caa7c338e10..ecb20380ef1f 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -619,7 +619,7 @@ def test_struct_categorical_5843() -> None: def test_empty_struct() -> None: # List df = pl.DataFrame({"a": [[{}]]}) - assert df.to_dict(as_series=False) == {"a": [[None]]} + assert df.to_dict(as_series=False) == {"a": [[{}]]} # Struct one not empty df = pl.DataFrame({"a": [[{}, {"a": 10}]]}) @@ -627,7 +627,7 @@ def test_empty_struct() -> None: # Empty struct df = pl.DataFrame({"a": [{}]}) - assert df.to_dict(as_series=False) == {"a": [None]} + assert df.to_dict(as_series=False) == {"a": [{}]} @pytest.mark.parametrize( diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index 93780e79293d..f76bec9af2d2 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -160,10 +160,8 @@ def test_ndjson_nested_null() -> None: # 'bar' represents an empty list of structs; check the schema is correct (eg: picks # up that it IS a list of structs), but confirm that list is empty (ref: #11301) # We don't support empty structs yet. So Null is closest. 
- assert df.schema == { - "foo": pl.Struct([pl.Field("bar", pl.List(pl.Struct({"": pl.Null})))]) - } - assert df.to_dict(as_series=False) == {"foo": [{"bar": []}]} + assert df.schema == {"foo": pl.Struct([pl.Field("bar", pl.List(pl.Struct({})))])} + assert df.to_dict(as_series=False) == {"foo": [{"bar": [{}]}]} def test_ndjson_nested_string_int() -> None: @@ -289,7 +287,7 @@ def test_ndjson_null_buffer() -> None: ("id", pl.Int64), ("zero_column", pl.Int64), ("empty_array_column", pl.List(pl.Null)), - ("empty_object_column", pl.Struct([pl.Field("", pl.Null)])), + ("empty_object_column", pl.Struct([])), ("null_column", pl.Null), ] ) @@ -388,7 +386,7 @@ def test_empty_json() -> None: df = pl.read_json(b'{"j":{}}') assert df.dtypes == [pl.Struct([])] - assert df.shape == (0, 1) + assert df.shape == (1, 1) def test_compressed_json() -> None: From 118a25e7a6d3fe633b13beefc8a9267e52b79b37 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Mon, 7 Oct 2024 17:08:19 +0200 Subject: [PATCH 02/17] all tests working still some points to clean up [skip ci] --- .../src/chunked_array/comparison/mod.rs | 3 +- .../src/chunked_array/struct_/mod.rs | 4 +-- crates/polars-core/src/frame/horizontal.rs | 29 +++++++++++----- crates/polars-core/src/frame/mod.rs | 33 ++++++++----------- crates/polars-core/src/frame/row/dataframe.rs | 6 ++++ crates/polars-core/src/series/any_value.rs | 4 --- .../src/executors/group_by_partitioned.rs | 2 +- .../src/chunked_array/strings/split.rs | 4 +-- crates/polars-ops/src/frame/pivot/unpivot.rs | 1 + .../polars-python/src/series/construction.rs | 2 -- py-polars/tests/unit/datatypes/test_struct.py | 26 +++++++++++++++ 11 files changed, 73 insertions(+), 41 deletions(-) diff --git a/crates/polars-core/src/chunked_array/comparison/mod.rs b/crates/polars-core/src/chunked_array/comparison/mod.rs index 0a58d5192d41..209278f5cde8 100644 --- a/crates/polars-core/src/chunked_array/comparison/mod.rs +++ b/crates/polars-core/src/chunked_array/comparison/mod.rs @@ -772,13 +772,14 @@ where BooleanChunked::full(PlSmallStr::EMPTY, value, a.len()) } else { let (a, b) = align_chunks_binary(a, b); + let mut out = a .fields_as_series() .iter() .zip(b.fields_as_series().iter()) .map(|(l, r)| op(l, r)) .reduce(reduce) - .unwrap(); + .unwrap_or_else(|| BooleanChunked::full(PlSmallStr::EMPTY, !value, a.len())); if !is_missing && (a.null_count() > 0 || b.null_count() > 0) { let mut a = a.into_owned(); diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index e2914dd823d5..b098d6e7fe05 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -374,10 +374,8 @@ impl StructChunked { .map(Column::from) .collect::>(); - let height = DataFrame::infer_height(&columns); - // SAFETY: invariants for struct are the same - unsafe { DataFrame::new_no_checks(height, columns) } + unsafe { DataFrame::new_no_checks(self.len(), columns) } } /// Get access to one of this `[StructChunked]`'s fields diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 31c072991d87..0886c6b3f958 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -32,6 +32,15 @@ impl DataFrame { /// - the length of all [`Column`] is equal to the height of this [`DataFrame`] /// - the columns names are unique pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { + // If we don't have any columns 
yet, copy the height from the given columns. + if let Some(fst) = columns.first() { + if self.width() == 0 { + // SAFETY: The function's invariants require all columns to be the same length, so + // that makes this a valid height. + unsafe { self.set_height(fst.len()) }; + } + } + self.columns.extend_from_slice(columns); self } @@ -68,7 +77,7 @@ impl DataFrame { /// Concat [`DataFrame`]s horizontally. /// Concat horizontally and extend with null values if lengths don't match pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> PolarsResult { - let max_len = dfs + let output_height = dfs .iter() .map(|df| df.height()) .max() @@ -77,18 +86,22 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars let owned_df; // if not all equal length, extend the DataFrame with nulls - let dfs = if !dfs.iter().all(|df| df.height() == max_len) { + let dfs = if !dfs.iter().all(|df| df.height() == output_height) { owned_df = dfs .iter() .cloned() .map(|mut df| { - if df.height() != max_len { - let diff = max_len - df.height(); - df.columns.iter_mut().for_each(|s| { - // @scalar-opt - let s = s.into_materialized_series(); - *s = s.extend_constant(AnyValue::Null, diff).unwrap() + if df.height() != output_height { + let diff = output_height - df.height(); + + // SAFETY: We extend each column with nulls to the point of being of length + // `output_height`. Then, we set the height of the resulting dataframe. + unsafe { df.get_columns_mut() }.iter_mut().for_each(|c| { + *c = c.extend_constant(AnyValue::Null, diff).unwrap(); }); + unsafe { + df.set_height(output_height); + } } df }) diff --git a/crates/polars-core/src/frame/mod.rs index 7f7b0d8be6a3..3ec3852d2093 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -497,14 +497,7 @@ impl DataFrame { /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame /// constructed with this method is generally highly unsafe and should not be long-lived. #[allow(clippy::missing_safety_doc)] - pub unsafe fn _new_no_checks_impl(height: usize, columns: Vec) -> DataFrame { - if cfg!(debug_assertions) && std::env::var("POLARS_VERIFY_COL_LENGTHS").is_ok() { - dbg!("Make const"); - for col in &columns { - assert_eq!(col.len(), height); - } - } - + pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec) -> DataFrame { DataFrame { height, columns } } @@ -682,13 +675,7 @@ impl DataFrame { #[inline] /// Extend the columns without checking for name collisions or height. pub unsafe fn column_extend_unchecked(&mut self, iter: impl Iterator) { - if cfg!(debug_assertions) { - for c in iter { - unsafe { self.with_column_unchecked(c) }; - } - } else { - unsafe { self.get_columns_mut() }.extend(iter) - } + unsafe { self.get_columns_mut() }.extend(iter) } /// Take ownership of the underlying columns vec. @@ -1767,14 +1754,22 @@ impl DataFrame { Ok(selected) } - fn filter_height(filtered: &[Column], mask: &BooleanChunked) -> usize { + fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize { // If there is a filtered column, just check how many rows are left. if let Some(fst) = filtered.first() { return fst.len(); } // Otherwise, count the number of values that would be filtered and return that height.
- mask.num_trues() + let num_trues = mask.num_trues(); + let num_filtered = if mask.len() == self.height() { + num_trues + } else { + debug_assert!(num_trues == 0 || num_trues == 1); + self.height() * num_trues + }; + + num_filtered } /// Take the [`DataFrame`] rows by a boolean mask. @@ -1790,7 +1785,7 @@ impl DataFrame { /// ``` pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult { let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?; - let height = Self::filter_height(&new_col, mask); + let height = self.filter_height(&new_col, mask); Ok(unsafe { DataFrame::new_no_checks(height, new_col) }) } @@ -1798,7 +1793,7 @@ impl DataFrame { /// Same as `filter` but does not parallelize. pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult { let new_col = self.try_apply_columns(&|s| s.filter(mask))?; - let height = Self::filter_height(&new_col, mask); + let height = self.filter_height(&new_col, mask); Ok(unsafe { DataFrame::new_no_checks(height, new_col) }) } diff --git a/crates/polars-core/src/frame/row/dataframe.rs b/crates/polars-core/src/frame/row/dataframe.rs index 1d11dcd9ecc0..97891c76d478 100644 --- a/crates/polars-core/src/frame/row/dataframe.rs +++ b/crates/polars-core/src/frame/row/dataframe.rs @@ -51,6 +51,12 @@ impl DataFrame { where I: Iterator>, { + if schema.is_empty() { + let height = rows.count(); + let columns = Vec::new(); + return Ok(unsafe { DataFrame::new_no_checks(height, columns) }); + } + let capacity = rows.size_hint().0; let mut buffers: Vec<_> = schema diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 8f669132dc49..b7ac6d091ec3 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -660,14 +660,10 @@ fn any_values_to_list( _ => inner_type.clone(), }; - dbg!(&list_inner_type); - let mut builder = get_list_builder(&list_inner_type, capacity * 5, capacity, PlSmallStr::EMPTY)?; for av in avs { - dbg!(&av); - match av { AnyValue::List(b) => match b.cast(inner_type) { Ok(casted) => { diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index c8f1726e57fd..57bb99c0fad9 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -156,7 +156,7 @@ fn estimate_unique_count(keys: &[Column], mut sample_size: usize) -> PolarsResul .map(|s| s.slice(offset, sample_size)) .map(Column::from) .collect::>(); - let df = unsafe { DataFrame::new_no_checks(sample_size, keys) }; + let df = unsafe { DataFrame::new_no_checks_height_from_first(keys) }; let names = df.get_column_names().into_iter().cloned(); let gb = df.group_by(names).unwrap(); Ok(finish(gb.get_groups())) diff --git a/crates/polars-ops/src/chunked_array/strings/split.rs b/crates/polars-ops/src/chunked_array/strings/split.rs index 98faaad75ed0..fce94fa1ad09 100644 --- a/crates/polars-ops/src/chunked_array/strings/split.rs +++ b/crates/polars-ops/src/chunked_array/strings/split.rs @@ -141,7 +141,6 @@ where }) } - let len = arrs[0].len(); let fields = arrs .into_iter() .enumerate() @@ -150,8 +149,7 @@ where }) .collect::>(); - dbg!("TODO: Properly implement"); - StructChunked::from_series(ca.name().clone(), len, fields.iter()) + StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter()) } pub fn split_helper<'a, F, I>(ca: &'a StringChunked, by: &'a StringChunked, op: F) -> ListChunked diff --git 
a/crates/polars-ops/src/frame/pivot/unpivot.rs b/crates/polars-ops/src/frame/pivot/unpivot.rs index 7715659b3c61..60f5bff8eae9 100644 --- a/crates/polars-ops/src/frame/pivot/unpivot.rs +++ b/crates/polars-ops/src/frame/pivot/unpivot.rs @@ -112,6 +112,7 @@ pub trait UnpivotDF: IntoDf { let value_col = Column::new_empty(value_name, &DataType::Null); let mut out = self_.select(index).unwrap().clear().take_columns(); + out.push(variable_col); out.push(value_col); diff --git a/crates/polars-python/src/series/construction.rs b/crates/polars-python/src/series/construction.rs index fa9380bd3023..5935f1e7b0ce 100644 --- a/crates/polars-python/src/series/construction.rs +++ b/crates/polars-python/src/series/construction.rs @@ -187,7 +187,6 @@ impl PySeries { .iter()? .map(|v| py_object_to_any_value(&(v?).as_borrowed(), strict, true)) .collect::>>(); - dbg!(&any_values_result); let result = any_values_result.and_then(|avs| { let s = Series::from_any_values(name.into(), avs.as_slice(), strict).map_err(|e| { PyTypeError::new_err(format!( @@ -224,7 +223,6 @@ impl PySeries { strict: bool, ) -> PyResult { let avs = convert_to_avs(values, strict, false)?; - dbg!(&avs); let s = Series::from_any_values_and_dtype(name.into(), avs.as_slice(), &dtype.0, strict) .map_err(|e| { PyTypeError::new_err(format!( diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index ecb20380ef1f..3371db3bae66 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -1048,3 +1048,29 @@ def test_struct_null_zip() -> None: df.select(pl.when(pl.Series([True])).then(pl.col.int).otherwise(pl.col.int)), pl.Series("int", [], dtype=pl.Struct({"x": pl.Int64})).to_frame(), ) + + +@pytest.mark.parametrize("size", [0, 1, 2, 5, 9, 13, 42]) +def test_zfs_construction(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])) + assert a.len() == size + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_zfs_unnest(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])).struct.unnest() + assert a.height == size + assert a.width == 0 + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_zfs_equality(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])) + b = pl.Series("a", [{}] * size, pl.Struct([])) + + assert_series_equal(a, b) + + assert_frame_equal( + a.to_frame(), + b.to_frame(), + ) From 7261a5b83af6a0c886d4d577f9bc328f9ca0c70a Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 09:54:40 +0200 Subject: [PATCH 03/17] add serialization --- crates/polars-core/src/serde/chunked_array.rs | 3 +- crates/polars-core/src/serde/series.rs | 37 +++++++++++++------ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/crates/polars-core/src/serde/chunked_array.rs b/crates/polars-core/src/serde/chunked_array.rs index 145f05c9af38..7a643d17185e 100644 --- a/crates/polars-core/src/serde/chunked_array.rs +++ b/crates/polars-core/src/serde/chunked_array.rs @@ -173,9 +173,10 @@ impl Serialize for StructChunked { )); } - let mut state = serializer.serialize_map(Some(3))?; + let mut state = serializer.serialize_map(Some(4))?; state.serialize_entry("name", self.name())?; state.serialize_entry("datatype", self.dtype())?; + state.serialize_entry("length", &self.len())?; state.serialize_entry("values", &self.fields_as_series())?; state.end() } diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index b9a70f44cbd2..3b2415d4d5c2 100644 --- 
a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -87,8 +87,8 @@ impl Serialize for Series { )), dt => { with_match_physical_numeric_polars_type!(dt, |$T| { - let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref(); - ca.serialize(serializer) + let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref(); + ca.serialize(serializer) }) }, } @@ -100,7 +100,7 @@ impl<'de> Deserialize<'de> for Series { where D: Deserializer<'de>, { - const FIELDS: &[&str] = &["name", "datatype", "bit_settings", "values"]; + const FIELDS: &[&str] = &["name", "datatype", "bit_settings", "length", "values"]; struct SeriesVisitor; @@ -109,7 +109,7 @@ impl<'de> Deserialize<'de> for Series { fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { formatter - .write_str("struct {name: , datatype: , bit_settings?: , values: }") + .write_str("struct {name: , datatype: , bit_settings?: , length?: , values: }") } fn visit_map(self, mut map: A) -> std::result::Result @@ -118,6 +118,7 @@ impl<'de> Deserialize<'de> for Series { { let mut name: Option> = None; let mut dtype = None; + let mut length = None; let mut bit_settings: Option = None; let mut values_set = false; while let Some(key) = map.next_key::>().unwrap() { @@ -134,6 +135,8 @@ impl<'de> Deserialize<'de> for Series { "bit_settings" => { bit_settings = Some(map.next_value()?); }, + // length is only used for struct at the moment + "length" => length = Some(map.next_value()?), "values" => { // we delay calling next_value until we know the dtype values_set = true; @@ -275,15 +278,25 @@ impl<'de> Deserialize<'de> for Series { Ok(Series::new(name, values)) }, #[cfg(feature = "dtype-struct")] - DataType::Struct(_) => { + DataType::Struct(fields) => { + let length = length.ok_or_else(|| de::Error::missing_field("length"))?; let values: Vec = map.next_value()?; - dbg!("TODO: Fix"); - let ca = StructChunked::from_series( - name.clone(), - values[0].len(), - values.iter(), - ) - .unwrap(); + + if fields.len() != values.len() { + let expected = format!("expected {} value series", fields.len()); + let expected = expected.as_str(); + return Err(de::Error::invalid_length(values.len(), &expected)); + } + + for (f, v) in fields.iter().zip(values.iter()) { + if f.dtype() != v.dtype() { + let err = format!("type mismatch for struct. 
expected: {}, given: {}", f.dtype(), v.dtype()); + return Err(de::Error::custom(err)); + } + } + + let ca = StructChunked::from_series(name.clone(), length, values.iter()) + .unwrap(); let mut s = ca.into_series(); s.rename(name); Ok(s) From cf5c63b39f7c63d35397e86544d511eeeb23ab71 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 13:23:38 +0200 Subject: [PATCH 04/17] add tests, fix ipc, avro and arrow --- .../src/io/avro/read/deserialize.rs | 2 + .../src/io/ipc/read/array/struct_.rs | 10 ++- crates/polars-arrow/src/io/ipc/read/common.rs | 15 ++++- crates/polars-arrow/src/io/ipc/read/schema.rs | 3 - crates/polars-arrow/src/legacy/conversion.rs | 1 - crates/polars-arrow/src/mmap/mod.rs | 34 ++++------ crates/polars-arrow/src/record_batch.rs | 43 ++++++------ crates/polars-core/src/frame/mod.rs | 65 ++++++++++++------- crates/polars-core/src/series/into.rs | 2 +- crates/polars-io/src/ipc/mmap.rs | 3 +- crates/polars-python/src/dataframe/export.rs | 3 +- crates/polars-row/src/decode.rs | 1 - crates/polars/tests/it/arrow/io/ipc/mod.rs | 6 +- crates/polars/tests/it/io/avro/read.rs | 8 ++- crates/polars/tests/it/io/avro/write.rs | 8 +-- .../polars/tests/it/io/parquet/arrow/mod.rs | 13 ++-- .../polars/tests/it/io/parquet/arrow/write.rs | 2 +- .../tests/it/io/parquet/read/row_group.rs | 3 +- .../polars/tests/it/io/parquet/roundtrip.rs | 2 +- py-polars/tests/unit/datatypes/test_struct.py | 61 +++++++++++++++++ py-polars/tests/unit/io/test_ipc.py | 52 ++++++++++++++- py-polars/tests/unit/io/test_json.py | 12 ++++ 22 files changed, 242 insertions(+), 107 deletions(-) diff --git a/crates/polars-arrow/src/io/avro/read/deserialize.rs b/crates/polars-arrow/src/io/avro/read/deserialize.rs index dd22eff764d9..e21d67598e0c 100644 --- a/crates/polars-arrow/src/io/avro/read/deserialize.rs +++ b/crates/polars-arrow/src/io/avro/read/deserialize.rs @@ -508,7 +508,9 @@ pub fn deserialize( }? 
} } + RecordBatchT::try_new( + rows, arrays .iter_mut() .zip(projection.iter()) diff --git a/crates/polars-arrow/src/io/ipc/read/array/struct_.rs b/crates/polars-arrow/src/io/ipc/read/array/struct_.rs index e2a99d2b5e24..a4b72af817e9 100644 --- a/crates/polars-arrow/src/io/ipc/read/array/struct_.rs +++ b/crates/polars-arrow/src/io/ipc/read/array/struct_.rs @@ -1,12 +1,13 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; -use polars_error::{polars_ensure, polars_err, PolarsResult}; +use polars_error::{polars_err, PolarsResult}; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; use super::super::read_basic::*; use super::super::{Compression, Dictionaries, IpcBuffer, Node, Version}; +use super::try_get_array_length; use crate::array::StructArray; use crate::datatypes::ArrowDataType; use crate::io::ipc::read::array::try_get_field_node; @@ -28,6 +29,7 @@ pub fn read_struct( scratch: &mut Vec, ) -> PolarsResult { let field_node = try_get_field_node(field_nodes, &dtype)?; + let length = try_get_array_length(field_node, limit)?; let validity = read_validity( buffers, @@ -41,10 +43,6 @@ pub fn read_struct( )?; let fields = StructArray::get_fields(&dtype); - polars_ensure!( - !fields.is_empty(), - nyi = "Cannot read zero field structs from IPC" - ); let values = fields .iter() @@ -68,7 +66,7 @@ pub fn read_struct( }) .collect::>>()?; - StructArray::try_new(dtype, values[0].len(), values, validity) + StructArray::try_new(dtype, length, values, validity) } pub fn skip_struct( diff --git a/crates/polars-arrow/src/io/ipc/read/common.rs b/crates/polars-arrow/src/io/ipc/read/common.rs index 2458cba702b9..fbb9155149bd 100644 --- a/crates/polars-arrow/src/io/ipc/read/common.rs +++ b/crates/polars-arrow/src/io/ipc/read/common.rs @@ -188,7 +188,16 @@ pub fn read_record_batch( }) .collect::>>()? }; - RecordBatchT::try_new(columns) + + let length = batch + .length() + .map_err(|_| polars_err!(oos = OutOfSpecKind::MissingData)) + .unwrap() + .try_into() + .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + let length = limit.map(|limit| limit.min(length)).unwrap_or(length); + + RecordBatchT::try_new(length, columns) } fn find_first_dict_field_d<'a>( @@ -353,6 +362,8 @@ pub fn apply_projection( chunk: RecordBatchT>, map: &PlHashMap, ) -> RecordBatchT> { + let length = chunk.len(); + // re-order according to projection let arrays = chunk.into_arrays(); let mut new_arrays = arrays.clone(); @@ -360,7 +371,7 @@ pub fn apply_projection( map.iter() .for_each(|(old, new)| new_arrays[*new] = arrays[*old].clone()); - RecordBatchT::new(new_arrays) + RecordBatchT::new(length, new_arrays) } #[cfg(test)] diff --git a/crates/polars-arrow/src/io/ipc/read/schema.rs b/crates/polars-arrow/src/io/ipc/read/schema.rs index 7fe6141e9b14..3e665a884eed 100644 --- a/crates/polars-arrow/src/io/ipc/read/schema.rs +++ b/crates/polars-arrow/src/io/ipc/read/schema.rs @@ -157,9 +157,6 @@ fn deserialize_struct(field: FieldRef) -> PolarsResult<(ArrowDataType, IpcField) let fields = field .children()? 
.ok_or_else(|| polars_err!(oos = "IPC: Struct must contain children"))?; - if fields.is_empty() { - polars_bail!(oos = "IPC: Struct must contain at least one child"); - } let (fields, ipc_fields) = try_unzip_vec(fields.iter().map(|field| { let (field, fields) = deserialize_field(field?)?; Ok((field, fields)) diff --git a/crates/polars-arrow/src/legacy/conversion.rs b/crates/polars-arrow/src/legacy/conversion.rs index 0d1ef3ca2b99..9504e45d878c 100644 --- a/crates/polars-arrow/src/legacy/conversion.rs +++ b/crates/polars-arrow/src/legacy/conversion.rs @@ -5,7 +5,6 @@ use crate::types::NativeType; pub fn chunk_to_struct(chunk: RecordBatchT, fields: Vec) -> StructArray { let dtype = ArrowDataType::Struct(fields); - dbg!("TODO: VERIFY"); StructArray::new(dtype, chunk.len(), chunk.into_arrays(), None) } diff --git a/crates/polars-arrow/src/mmap/mod.rs b/crates/polars-arrow/src/mmap/mod.rs index b934c31de563..37baba59e9df 100644 --- a/crates/polars-arrow/src/mmap/mod.rs +++ b/crates/polars-arrow/src/mmap/mod.rs @@ -71,7 +71,7 @@ fn get_buffers_nodes(batch: RecordBatchRef) -> PolarsResult<(VecDeque Ok((buffers, field_nodes)) } -unsafe fn _mmap_record>( +unsafe fn mmap_record>( fields: &ArrowSchema, ipc_fields: &[IpcField], data: Arc, @@ -86,6 +86,13 @@ unsafe fn _mmap_record>( .map(|v| v.iter().map(|v| v as usize).collect::>()) .unwrap_or_else(VecDeque::new); + let length = batch + .length() + .map_err(|_| polars_err!(oos = OutOfSpecKind::MissingData)) + .unwrap() + .try_into() + .map_err(|_| polars_err!(oos = OutOfSpecKind::NegativeFooterLength))?; + fields .iter_values() .map(|f| &f.dtype) @@ -104,26 +111,7 @@ unsafe fn _mmap_record>( ) }) .collect::>() - .and_then(RecordBatchT::try_new) -} - -unsafe fn _mmap_unchecked>( - fields: &ArrowSchema, - ipc_fields: &[IpcField], - data: Arc, - block: Block, - dictionaries: &Dictionaries, -) -> PolarsResult>> { - let (message, offset) = read_message(data.as_ref().as_ref(), block)?; - let batch = get_record_batch(message)?; - _mmap_record( - fields, - ipc_fields, - data.clone(), - batch, - offset, - dictionaries, - ) + .and_then(|arr| RecordBatchT::try_new(length, arr)) } /// Memory maps an record batch from an IPC file into a [`RecordBatchT`]. @@ -147,7 +135,7 @@ pub unsafe fn mmap_unchecked>( let (message, offset) = read_message(data.as_ref().as_ref(), block)?; let batch = get_record_batch(message)?; - _mmap_record( + mmap_record( &metadata.schema, &metadata.ipc_schema.fields, data.clone(), @@ -188,7 +176,7 @@ unsafe fn mmap_dictionary>( // Make a fake schema for the dictionary batch. let field = Field::new(PlSmallStr::EMPTY, value_type.clone(), false); - let chunk = _mmap_record( + let chunk = mmap_record( &std::iter::once((field.name.clone(), field)).collect(), &[first_ipc_field.clone()], data.clone(), diff --git a/crates/polars-arrow/src/record_batch.rs b/crates/polars-arrow/src/record_batch.rs index b11fd6c899a2..a0c6bbb6c290 100644 --- a/crates/polars-arrow/src/record_batch.rs +++ b/crates/polars-arrow/src/record_batch.rs @@ -1,7 +1,7 @@ //! Contains [`RecordBatchT`], a container of [`Array`] where every array has the //! same length. -use polars_error::{polars_bail, PolarsResult}; +use polars_error::{polars_ensure, PolarsResult}; use crate::array::{Array, ArrayRef}; @@ -9,6 +9,7 @@ use crate::array::{Array, ArrayRef}; /// the same length, [`RecordBatchT::len`]. 
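The `RecordBatchT` rework below stores the length as its own field instead of deriving it from the first array, which breaks down once zero-column batches are allowed. A short usage sketch, assuming the constructor signature introduced here:

    use polars_arrow::array::{Array, Int32Array};
    use polars_arrow::record_batch::RecordBatchT;

    let col = Int32Array::from_slice([1, 2, 3]).boxed();
    // The declared length must match every array...
    assert!(RecordBatchT::try_new(3, vec![col]).is_ok());
    // ...and a zero-column batch can still carry a row count.
    let empty: RecordBatchT<Box<dyn Array>> = RecordBatchT::new(3, vec![]);
    assert_eq!(empty.len(), 3);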
#[derive(Debug, Clone, PartialEq, Eq)] pub struct RecordBatchT> { + length: usize, arrays: Vec, } @@ -16,29 +17,26 @@ pub type RecordBatch = RecordBatchT; impl> RecordBatchT { /// Creates a new [`RecordBatchT`]. - /// # Panic - /// Iff the arrays do not have the same length - pub fn new(arrays: Vec) -> Self { - Self::try_new(arrays).unwrap() + /// + /// # Panics + /// + /// Iff the length does not match the length of any of the arrays + pub fn new(length: usize, arrays: Vec) -> Self { + Self::try_new(length, arrays).unwrap() } /// Creates a new [`RecordBatchT`]. + /// /// # Error - /// Iff the arrays do not have the same length - pub fn try_new(arrays: Vec) -> PolarsResult { - if !arrays.is_empty() { - let len = arrays.first().unwrap().as_ref().len(); - if arrays - .iter() - .map(|array| array.as_ref()) - .any(|array| array.len() != len) - { - polars_bail!(ComputeError: - "RecordBatch requires all its arrays to have an equal number of rows".to_string(), - ); - } - } - Ok(Self { arrays }) + /// + /// Iff the length does not match the length of any of the arrays + pub fn try_new(length: usize, arrays: Vec) -> PolarsResult { + polars_ensure!( + arrays.iter().all(|arr| arr.as_ref().len() == length), + ComputeError: "RecordBatch requires all its arrays to have an equal number of rows".to_string(), + ); + + Ok(Self { length, arrays }) } /// returns the [`Array`]s in [`RecordBatchT`] @@ -53,10 +51,7 @@ impl> RecordBatchT { /// returns the number of rows of every array pub fn len(&self) -> usize { - self.arrays - .first() - .map(|x| x.as_ref().len()) - .unwrap_or_default() + self.length } /// returns whether the columns have any rows diff --git a/crates/polars-core/src/frame/mod.rs index 3ec3852d2093..308e5ccda3bb 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -467,6 +467,11 @@ impl DataFrame { /// `Series`. /// /// Calculates the height from the first column or `0` if no columns are given. + /// + /// # Safety + /// + /// It is the caller's responsibility to uphold the contract of all `Series` + /// having an equal length and a unique name; if not, this may panic down the line. pub unsafe fn new_no_checks_height_from_first(columns: Vec) -> DataFrame { let height = columns.first().map_or(0, Column::len); unsafe { Self::new_no_checks(height, columns) } @@ -674,6 +679,14 @@ impl DataFrame { #[inline] /// Extend the columns without checking for name collisions or height. + /// + /// # Safety + /// + /// The caller needs to ensure that: + /// - Column names are unique within the resulting [`DataFrame`]. + /// - The length of each appended column matches the height of the [`DataFrame`]. For + /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards + /// with [`DataFrame::set_height`]. pub unsafe fn column_extend_unchecked(&mut self, iter: impl Iterator) { unsafe { self.get_columns_mut() }.extend(iter) } /// Take ownership of the underlying columns vec. @@ -1762,14 +1775,13 @@ impl DataFrame { // Otherwise, count the number of values that would be filtered and return that height. let num_trues = mask.num_trues(); - let num_filtered = if mask.len() == self.height() { + if mask.len() == self.height() { num_trues } else { + // This is for broadcasting masks debug_assert!(num_trues == 0 || num_trues == 1); self.height() * num_trues - }; - - num_filtered + } } /// Take the [`DataFrame`] rows by a boolean mask.
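Worth spelling out the broadcasting branch in `filter_height` above: a length-1 mask is treated as a scalar over the whole frame, so the filtered height is either every row (the single value is true) or none (it is false). A standalone sketch of that arithmetic, mirroring the hunk rather than any public API:

    fn filter_height(height: usize, mask_len: usize, num_trues: usize) -> usize {
        if mask_len == height {
            // Regular mask: keep one row per set bit.
            num_trues
        } else {
            // Broadcasting mask: a single boolean scales to all rows or none.
            debug_assert!(num_trues == 0 || num_trues == 1);
            height * num_trues
        }
    }

    fn main() {
        assert_eq!(filter_height(4, 1, 1), 4); // broadcast true keeps all rows
        assert_eq!(filter_height(4, 1, 0), 0); // broadcast false keeps none
    }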
@@ -3284,27 +3296,29 @@ impl<'a> Iterator for RecordBatchIter<'a> { fn next(&mut self) -> Option { if self.idx >= self.n_chunks { - None + return None; + } + + // Create a batch of the columns with the same chunk no. + let batch_cols: Vec = if self.parallel { + let iter = self + .columns + .par_iter() + .map(Column::as_materialized_series) + .map(|s| s.to_arrow(self.idx, self.compat_level)); + POOL.install(|| iter.collect()) } else { - // Create a batch of the columns with the same chunk no. - let batch_cols = if self.parallel { - let iter = self - .columns - .par_iter() - .map(Column::as_materialized_series) - .map(|s| s.to_arrow(self.idx, self.compat_level)); - POOL.install(|| iter.collect()) - } else { - self.columns - .iter() - .map(Column::as_materialized_series) - .map(|s| s.to_arrow(self.idx, self.compat_level)) - .collect() - }; - self.idx += 1; + self.columns + .iter() + .map(Column::as_materialized_series) + .map(|s| s.to_arrow(self.idx, self.compat_level)) + .collect() + }; + self.idx += 1; - Some(RecordBatch::new(batch_cols)) - } + let length = batch_cols.first().map_or(0, |arr| arr.len()); + + Some(RecordBatch::new(length, batch_cols)) } fn size_hint(&self) -> (usize, Option) { @@ -3325,7 +3339,10 @@ impl Iterator for PhysRecordBatchIter<'_> { .iter_mut() .map(|phys_iter| phys_iter.next().cloned()) .collect::>>() - .map(RecordBatch::new) + .map(|arrs| { + let length = arrs.first().map_or(0, |arr| arr.len()); + RecordBatch::new(length, arrs) + }) } fn size_hint(&self) -> (usize, Option) { diff --git a/crates/polars-core/src/series/into.rs b/crates/polars-core/src/series/into.rs index 914450034cf4..c7f86ff85df1 100644 --- a/crates/polars-core/src/series/into.rs +++ b/crates/polars-core/src/series/into.rs @@ -46,7 +46,7 @@ impl Series { .collect::>(); StructArray::new( dt.to_arrow(compat_level), - ca.len(), + arr.len(), values, arr.validity().cloned(), ) diff --git a/crates/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs index f0343642482e..b8749c5a7392 100644 --- a/crates/polars-io/src/ipc/mmap.rs +++ b/crates/polars-io/src/ipc/mmap.rs @@ -96,9 +96,10 @@ impl ArrowReader for MMapChunkIter<'_> { let chunk = match &self.projection { None => chunk, Some(proj) => { + let length = chunk.len(); let cols = chunk.into_arrays(); let arrays = proj.iter().map(|i| cols[*i].clone()).collect(); - RecordBatch::new(arrays) + RecordBatch::new(length, arrays) }, }; Ok(Some(chunk)) diff --git a/crates/polars-python/src/dataframe/export.rs b/crates/polars-python/src/dataframe/export.rs index 29ba4edb5371..a7e9394ab4fd 100644 --- a/crates/polars-python/src/dataframe/export.rs +++ b/crates/polars-python/src/dataframe/export.rs @@ -113,6 +113,7 @@ impl PyDataFrame { .df .iter_chunks(CompatLevel::oldest(), true) .map(|rb| { + let length = rb.len(); let mut rb = rb.into_arrays(); for i in &cat_columns { let arr = rb.get_mut(*i).unwrap(); @@ -128,7 +129,7 @@ impl PyDataFrame { .unwrap(); *arr = out; } - let rb = RecordBatch::new(rb); + let rb = RecordBatch::new(length, rb); interop::arrow::to_py::to_py_rb(&rb, &names, py, &pyarrow) }) diff --git a/crates/polars-row/src/decode.rs b/crates/polars-row/src/decode.rs index 9dcac4a01711..c60e614350cd 100644 --- a/crates/polars-row/src/decode.rs +++ b/crates/polars-row/src/decode.rs @@ -64,7 +64,6 @@ unsafe fn decode(rows: &mut [&[u8]], field: &EncodingField, dtype: &ArrowDataTyp .iter() .map(|struct_fld| decode(rows, field, struct_fld.dtype())) .collect(); - dbg!("TODO: VERIFY"); StructArray::new(dtype.clone(), rows.len(), values, None).to_boxed() 
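The hunks above all repeat one pattern: capture the batch length before the arrays are moved or projected, because a projection may select zero columns, at which point the row count would be unrecoverable. A hypothetical helper showing the shape of the fix (not an API from the crate):

    use polars_arrow::array::Array;

    fn project(
        batch_len: usize,
        arrays: Vec<Box<dyn Array>>,
        proj: &[usize],
    ) -> (usize, Vec<Box<dyn Array>>) {
        let projected = proj.iter().map(|i| arrays[*i].clone()).collect();
        // The length survives even when `proj` is empty.
        (batch_len, projected)
    }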
}, dt => { diff --git a/crates/polars/tests/it/arrow/io/ipc/mod.rs b/crates/polars/tests/it/arrow/io/ipc/mod.rs index 3dfd0aedd276..8004f2fc8eea 100644 --- a/crates/polars/tests/it/arrow/io/ipc/mod.rs +++ b/crates/polars/tests/it/arrow/io/ipc/mod.rs @@ -62,7 +62,7 @@ fn prep_schema(array: &dyn Array) -> ArrowSchemaRef { fn write_boolean() -> PolarsResult<()> { let array = BooleanArray::from([Some(true), Some(false), None, Some(true)]).boxed(); let schema = prep_schema(array.as_ref()); - let columns = RecordBatchT::try_new(vec![array])?; + let columns = RecordBatchT::try_new(4, vec![array])?; round_trip(columns, schema, None, Some(Compression::ZSTD)) } @@ -72,7 +72,7 @@ fn write_sliced_utf8() -> PolarsResult<()> { .sliced(1, 1) .boxed(); let schema = prep_schema(array.as_ref()); - let columns = RecordBatchT::try_new(vec![array])?; + let columns = RecordBatchT::try_new(array.len(), vec![array])?; round_trip(columns, schema, None, Some(Compression::ZSTD)) } @@ -80,6 +80,6 @@ fn write_sliced_utf8() -> PolarsResult<()> { fn write_binview() -> PolarsResult<()> { let array = Utf8ViewArray::from_slice([Some("foo"), Some("bar"), None, Some("hamlet")]).boxed(); let schema = prep_schema(array.as_ref()); - let columns = RecordBatchT::try_new(vec![array])?; + let columns = RecordBatchT::try_new(array.len(), vec![array])?; round_trip(columns, schema, None, Some(Compression::ZSTD)) } diff --git a/crates/polars/tests/it/io/avro/read.rs b/crates/polars/tests/it/io/avro/read.rs index d03503af2049..57adac991d1a 100644 --- a/crates/polars/tests/it/io/avro/read.rs +++ b/crates/polars/tests/it/io/avro/read.rs @@ -124,7 +124,7 @@ pub(super) fn data() -> RecordBatchT> { .boxed(), ]; - RecordBatchT::try_new(columns).unwrap() + RecordBatchT::try_new(2, columns).unwrap() } pub(super) fn write_avro(codec: Codec) -> Result, apache_avro::Error> { @@ -258,6 +258,7 @@ fn test_projected() -> PolarsResult<()> { let mut projection = vec![false; expected_schema.len()]; projection[i] = true; + let length = expected.first().map_or(0, |arr| arr.len()); let expected = expected .clone() .into_arrays() @@ -265,7 +266,7 @@ fn test_projected() -> PolarsResult<()> { .zip(projection.iter()) .filter_map(|x| if *x.1 { Some(x.0) } else { None }) .collect(); - let expected = RecordBatchT::new(expected); + let expected = RecordBatchT::new(length, expected); let expected_schema = expected_schema .clone() @@ -326,9 +327,10 @@ pub(super) fn data_list() -> RecordBatchT> { ); array.try_extend(data).unwrap(); + let length = array.len(); let columns = vec![array.into_box()]; - RecordBatchT::try_new(columns).unwrap() + RecordBatchT::try_new(length, columns).unwrap() } pub(super) fn write_list(codec: Codec) -> Result, apache_avro::Error> { diff --git a/crates/polars/tests/it/io/avro/write.rs b/crates/polars/tests/it/io/avro/write.rs index 8aa1d5d8374b..95905aa54ff3 100644 --- a/crates/polars/tests/it/io/avro/write.rs +++ b/crates/polars/tests/it/io/avro/write.rs @@ -102,7 +102,7 @@ pub(super) fn data() -> RecordBatchT> { )), ]; - RecordBatchT::new(columns) + RecordBatchT::new(2, columns) } pub(super) fn serialize_to_block>( @@ -197,7 +197,7 @@ fn large_format_data() -> RecordBatchT> { Box::new(BinaryArray::::from_slice([b"foo", b"bar"])), Box::new(BinaryArray::::from([Some(b"foo"), None])), ]; - RecordBatchT::new(columns) + RecordBatchT::new(2, columns) } fn large_format_expected_schema() -> ArrowSchema { @@ -216,7 +216,7 @@ fn large_format_expected_data() -> RecordBatchT> { Box::new(BinaryArray::::from_slice([b"foo", b"bar"])), 
Box::new(BinaryArray::::from([Some(b"foo"), None])), ]; - RecordBatchT::new(columns) + RecordBatchT::new(2, columns) } #[test] @@ -265,7 +265,7 @@ fn struct_data() -> RecordBatchT> { Field::new("item2".into(), ArrowDataType::Int32, true), ]); - RecordBatchT::new(vec![ + RecordBatchT::new(2, vec![ Box::new(StructArray::new( struct_dt.clone(), 2, diff --git a/crates/polars/tests/it/io/parquet/arrow/mod.rs b/crates/polars/tests/it/io/parquet/arrow/mod.rs index fd01930944fc..5d259b595d9e 100644 --- a/crates/polars/tests/it/io/parquet/arrow/mod.rs +++ b/crates/polars/tests/it/io/parquet/arrow/mod.rs @@ -1421,7 +1421,7 @@ fn generic_data() -> PolarsResult<(ArrowSchema, RecordBatchT>)> { Field::new("a12".into(), array12.dtype().clone(), true), Field::new("a13".into(), array13.dtype().clone(), true), ]); - let chunk = RecordBatchT::try_new(vec![ + let chunk = RecordBatchT::try_new(array1.len(), vec![ array1.boxed(), array2.boxed(), array3.boxed(), @@ -1449,12 +1449,13 @@ fn assert_roundtrip( let (new_schema, new_chunks) = integration_read(&r, limit)?; let expected = if let Some(limit) = limit { + let length = chunk.len().min(limit); let expected = chunk .into_arrays() .into_iter() .map(|x| x.sliced(0, limit)) .collect::>(); - RecordBatchT::new(expected) + RecordBatchT::new(length, expected) } else { chunk }; @@ -1515,7 +1516,7 @@ fn assert_array_roundtrip( ) -> PolarsResult<()> { let schema = ArrowSchema::from_iter([Field::new("a1".into(), array.dtype().clone(), is_nullable)]); - let chunk = RecordBatchT::try_new(vec![array])?; + let chunk = RecordBatchT::try_new(array.len(), vec![array])?; assert_roundtrip(schema, chunk, limit) } @@ -1644,7 +1645,7 @@ fn nested_dict_data( )?; let schema = ArrowSchema::from_iter([Field::new("c1".into(), values.dtype().clone(), true)]); - let chunk = RecordBatchT::try_new(vec![values.boxed()])?; + let chunk = RecordBatchT::try_new(values.len(), vec![values.boxed()])?; Ok((schema, chunk)) } @@ -1672,8 +1673,8 @@ fn nested_dict_limit() -> PolarsResult<()> { #[test] fn filter_chunk() -> PolarsResult<()> { - let chunk1 = RecordBatchT::new(vec![PrimitiveArray::from_slice([1i16, 3]).boxed()]); - let chunk2 = RecordBatchT::new(vec![PrimitiveArray::from_slice([2i16, 4]).boxed()]); + let chunk1 = RecordBatchT::new(2, vec![PrimitiveArray::from_slice([1i16, 3]).boxed()]); + let chunk2 = RecordBatchT::new(2, vec![PrimitiveArray::from_slice([2i16, 4]).boxed()]); let schema = ArrowSchema::from_iter([Field::new("c1".into(), ArrowDataType::Int16, true)]); let r = integration_write(&schema, &[chunk1.clone(), chunk2.clone()])?; diff --git a/crates/polars/tests/it/io/parquet/arrow/write.rs b/crates/polars/tests/it/io/parquet/arrow/write.rs index 8863c068baff..9619a083ddcb 100644 --- a/crates/polars/tests/it/io/parquet/arrow/write.rs +++ b/crates/polars/tests/it/io/parquet/arrow/write.rs @@ -50,7 +50,7 @@ fn round_trip_opt_stats( data_page_size: None, }; - let iter = vec![RecordBatchT::try_new(vec![array.clone()])]; + let iter = vec![RecordBatchT::try_new(array.len(), vec![array.clone()])]; let row_groups = RowGroupIterator::try_new(iter.into_iter(), &schema, options, vec![encodings])?; diff --git a/crates/polars/tests/it/io/parquet/read/row_group.rs b/crates/polars/tests/it/io/parquet/read/row_group.rs index 80478a0da958..8008c594ed19 100644 --- a/crates/polars/tests/it/io/parquet/read/row_group.rs +++ b/crates/polars/tests/it/io/parquet/read/row_group.rs @@ -52,7 +52,8 @@ impl Iterator for RowGroupDeserializer { if self.remaining_rows == 0 { return None; } - let chunk = 
RecordBatchT::try_new(std::mem::take(&mut self.column_chunks)); + let length = self.column_chunks.first().map_or(0, |chunk| chunk.len()); + let chunk = RecordBatchT::try_new(length, std::mem::take(&mut self.column_chunks)); self.remaining_rows = self.remaining_rows.saturating_sub( chunk .as_ref() diff --git a/crates/polars/tests/it/io/parquet/roundtrip.rs index 6e105002fa53..d20551432ec0 100644 --- a/crates/polars/tests/it/io/parquet/roundtrip.rs +++ b/crates/polars/tests/it/io/parquet/roundtrip.rs @@ -28,7 +28,7 @@ fn round_trip( data_page_size: None, }; - let iter = vec![RecordBatchT::try_new(vec![array.clone()])]; + let iter = vec![RecordBatchT::try_new(array.len(), vec![array.clone()])]; let row_groups = RowGroupIterator::try_new(iter.into_iter(), &schema, options, vec![encodings])?; diff --git a/py-polars/tests/unit/datatypes/test_struct.py index 3371db3bae66..9000af70f4ab 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -7,6 +7,7 @@ import pandas as pd import pyarrow as pa import pytest +import io import polars as pl import polars.selectors as cs @@ -1074,3 +1075,63 @@ def test_zfs_equality(size: int) -> None: a.to_frame(), b.to_frame(), ) + + +def test_zfs_nullable_when_otherwise() -> None: + a = pl.Series("a", [{}, None, {}, {}, None], pl.Struct([])) + b = pl.Series("b", [None, {}, None, {}, None], pl.Struct([])) + + df = pl.DataFrame([a, b]) + + df = df.select( + x = pl.when(pl.col.a.is_not_null()).then(pl.col.a).otherwise(pl.col.b), + y = pl.when(pl.col.a.is_null()).then(pl.col.a).otherwise(pl.col.b), + ) + + assert_series_equal(df['x'], pl.Series('x', [{}, {}, {}, {}, None], pl.Struct([]))) + assert_series_equal(df['y'], pl.Series('y', [None, None, None, {}, None], pl.Struct([]))) + + +def test_zfs_struct_fns() -> None: + a = pl.Series("a", [{}], pl.Struct([])) + + assert a.struct.fields == [] + + # @TODO: This should really throw an error as per #19132 + assert a.struct.rename_fields(['a']).struct.unnest().shape == (1, 0) + assert a.struct.rename_fields([]).struct.unnest().shape == (1, 0) + + assert_series_equal(a.struct.json_encode(), pl.Series('a', ["{}"], pl.String)) + + +@pytest.mark.parametrize("format", ['binary', 'json']) +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_zfs_serialization_roundtrip(format: pl.SerializationFormat, size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() + + f = io.BytesIO() + a.serialize(f, format=format) + + f.seek(0) + assert_frame_equal( + a, + pl.DataFrame.deserialize(f, format=format), + ) + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_zfs_row_encoding(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])) + + df = pl.DataFrame([a, pl.Series("x", list(range(size)), pl.Int8)]) + + gb = ( + df + .lazy() + .group_by(["a", "x"]) + .agg(pl.all().min()) + .collect(streaming=True) + ) + + # We need to ignore the order because the group_by is nondeterministic + assert_frame_equal(gb, df, check_row_order=False) diff --git a/py-polars/tests/unit/io/test_ipc.py index 37a583d929df..903e3cc161bc 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -2,7 +2,7 @@ import io from decimal import Decimal -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Dict import pandas as pd import pytest @@ -354,3 +354,53 @@ def
test_ipc_variadic_buffers_categorical_binview_18636() -> None: df.write_ipc(b) b.seek(0) assert_frame_equal(pl.read_ipc(b), df) + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_ipc_chunked_roundtrip(size: int) -> None: + a = pl.Series("a", [{ 'x': 1 }] * size, pl.Struct({ 'x': pl.Int8 })).to_frame() + + c = pl.concat([a] * 2, how='vertical') + + f = io.BytesIO() + c.write_ipc(f) + + f.seek(0) + assert_frame_equal(c, pl.read_ipc(f)) + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_zfs_ipc_roundtrip(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() + + f = io.BytesIO() + a.write_ipc(f) + + f.seek(0) + assert_frame_equal(a, pl.read_ipc(f)) + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +def test_zfs_ipc_chunked_roundtrip(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() + + c = pl.concat([a] * 2, how='vertical') + + f = io.BytesIO() + c.write_ipc(f) + + f.seek(0) + assert_frame_equal(c, pl.read_ipc(f)) + + +@pytest.mark.parametrize("size", [0, 1, 2, 13]) +@pytest.mark.parametrize("value", [{}, { 'x': 1 }]) +@pytest.mark.write_disk +def test_memmap_ipc_chunked_structs(size: int, value: Dict[str, int], tmp_path: Path) -> None: + a = pl.Series("a", [value] * size, pl.Struct).to_frame() + + c = pl.concat([a] * 2, how='vertical') + + f = tmp_path / 'f.ipc' + c.write_ipc(f) + assert_frame_equal(c, pl.read_ipc(f)) diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index f76bec9af2d2..1e259e11daec 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -436,3 +436,15 @@ def test_json_infer_3_dtypes() -> None: out = df.select(pl.col("a").str.json_decode()) assert out["a"].to_list() == [None, ["1"], ["1", "2"]] assert out.dtypes[0] == pl.List(pl.String) + + +#NOTE: This doesn't work for 0, but that is normal +@pytest.mark.parametrize("size", [1, 2, 13]) +def test_zfs_json_roundtrip(size: int) -> None: + a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() + + f = io.StringIO() + a.write_json(f) + + f.seek(0) + assert_frame_equal(a, pl.read_json(f)) From 3beca20d7c646747466746f58cbdcf13d4176b7b Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 14:02:59 +0200 Subject: [PATCH 05/17] fix avro and growable --- .../src/array/growable/structure.rs | 1 + .../src/io/avro/read/deserialize.rs | 2 +- .../polars-arrow/src/io/avro/read/nested.rs | 6 ++--- py-polars/tests/unit/datatypes/test_struct.py | 26 ++++++++----------- py-polars/tests/unit/io/test_ipc.py | 18 +++++++------ py-polars/tests/unit/io/test_json.py | 2 +- 6 files changed, 27 insertions(+), 28 deletions(-) diff --git a/crates/polars-arrow/src/array/growable/structure.rs b/crates/polars-arrow/src/array/growable/structure.rs index 6c076864f585..79f922d318fa 100644 --- a/crates/polars-arrow/src/array/growable/structure.rs +++ b/crates/polars-arrow/src/array/growable/structure.rs @@ -102,6 +102,7 @@ impl<'a> Growable<'a> for GrowableStruct<'a> { if let Some(validity) = &mut self.validity { validity.extend_constant(additional, false); } + self.length += additional; } #[inline] diff --git a/crates/polars-arrow/src/io/avro/read/deserialize.rs b/crates/polars-arrow/src/io/avro/read/deserialize.rs index e21d67598e0c..dc4c04ed15ea 100644 --- a/crates/polars-arrow/src/io/avro/read/deserialize.rs +++ b/crates/polars-arrow/src/io/avro/read/deserialize.rs @@ -56,7 +56,7 @@ fn make_mutable( .iter() .map(|field| make_mutable(field.dtype(), None, capacity)) .collect::>>()?; - 
Box::new(DynMutableStructArray::new(values, 0, dtype.clone())) + Box::new(DynMutableStructArray::new(values, dtype.clone())) as Box }, other => { diff --git a/crates/polars-arrow/src/io/avro/read/nested.rs b/crates/polars-arrow/src/io/avro/read/nested.rs index 17baf2ef05e7..e197b827d732 100644 --- a/crates/polars-arrow/src/io/avro/read/nested.rs +++ b/crates/polars-arrow/src/io/avro/read/nested.rs @@ -217,10 +217,10 @@ pub struct DynMutableStructArray { } impl DynMutableStructArray { - pub fn new(values: Vec>, length: usize, dtype: ArrowDataType) -> Self { + pub fn new(values: Vec>, dtype: ArrowDataType) -> Self { Self { dtype, - length, + length: 0, values, validity: None, } @@ -243,11 +243,11 @@ impl DynMutableStructArray { #[inline] fn push_null(&mut self) { self.values.iter_mut().for_each(|x| x.push_null()); + self.length += 1; match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), } - self.length += 1; } fn init_validity(&mut self) { diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 9000af70f4ab..fd8d6dd0c0e5 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from dataclasses import dataclass from datetime import datetime, time from typing import TYPE_CHECKING @@ -7,7 +8,6 @@ import pandas as pd import pyarrow as pa import pytest -import io import polars as pl import polars.selectors as cs @@ -1084,12 +1084,14 @@ def test_zfs_nullable_when_otherwise() -> None: df = pl.DataFrame([a, b]) df = df.select( - x = pl.when(pl.col.a.is_not_null()).then(pl.col.a).otherwise(pl.col.b), - y = pl.when(pl.col.a.is_null()).then(pl.col.a).otherwise(pl.col.b), + x=pl.when(pl.col.a.is_not_null()).then(pl.col.a).otherwise(pl.col.b), + y=pl.when(pl.col.a.is_null()).then(pl.col.a).otherwise(pl.col.b), ) - assert_series_equal(df['x'], pl.Series('x', [{}, {}, {}, {}, None], pl.Struct([]))) - assert_series_equal(df['y'], pl.Series('y', [None, None, None, {}, None], pl.Struct([]))) + assert_series_equal(df["x"], pl.Series("x", [{}, {}, {}, {}, None], pl.Struct([]))) + assert_series_equal( + df["y"], pl.Series("y", [None, None, None, {}, None], pl.Struct([])) + ) def test_zfs_struct_fns() -> None: @@ -1098,13 +1100,13 @@ def test_zfs_struct_fns() -> None: assert a.struct.fields == [] # @TODO: This should really throw an error as per #19132 - assert a.struct.rename_fields(['a']).struct.unnest().shape == (1, 0) + assert a.struct.rename_fields(["a"]).struct.unnest().shape == (1, 0) assert a.struct.rename_fields([]).struct.unnest().shape == (1, 0) - assert_series_equal(a.struct.json_encode(), pl.Series('a', ["{}"], pl.String)) + assert_series_equal(a.struct.json_encode(), pl.Series("a", ["{}"], pl.String)) -@pytest.mark.parametrize("format", ['binary', 'json']) +@pytest.mark.parametrize("format", ["binary", "json"]) @pytest.mark.parametrize("size", [0, 1, 2, 13]) def test_zfs_serialization_roundtrip(format: pl.SerializationFormat, size: int) -> None: a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() @@ -1125,13 +1127,7 @@ def test_zfs_row_encoding(size: int) -> None: df = pl.DataFrame([a, pl.Series("x", list(range(size)), pl.Int8)]) - gb = ( - df - .lazy() - .group_by(["a", "x"]) - .agg(pl.all().min()) - .collect(streaming=True) - ) + gb = df.lazy().group_by(["a", "x"]).agg(pl.all().min()).collect(streaming=True) # We need to ignore the order because the group_by is undeterministic 
assert_frame_equal(gb, df, check_row_order=False) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 903e3cc161bc..84e6436cb10e 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -2,7 +2,7 @@ import io from decimal import Decimal -from typing import TYPE_CHECKING, Any, Dict +from typing import TYPE_CHECKING, Any import pandas as pd import pytest @@ -358,9 +358,9 @@ def test_ipc_variadic_buffers_categorical_binview_18636() -> None: @pytest.mark.parametrize("size", [0, 1, 2, 13]) def test_ipc_chunked_roundtrip(size: int) -> None: - a = pl.Series("a", [{ 'x': 1 }] * size, pl.Struct({ 'x': pl.Int8 })).to_frame() + a = pl.Series("a", [{"x": 1}] * size, pl.Struct({"x": pl.Int8})).to_frame() - c = pl.concat([a] * 2, how='vertical') + c = pl.concat([a] * 2, how="vertical") f = io.BytesIO() c.write_ipc(f) @@ -384,7 +384,7 @@ def test_zfs_ipc_roundtrip(size: int) -> None: def test_zfs_ipc_chunked_roundtrip(size: int) -> None: a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() - c = pl.concat([a] * 2, how='vertical') + c = pl.concat([a] * 2, how="vertical") f = io.BytesIO() c.write_ipc(f) @@ -394,13 +394,15 @@ def test_zfs_ipc_chunked_roundtrip(size: int) -> None: @pytest.mark.parametrize("size", [0, 1, 2, 13]) -@pytest.mark.parametrize("value", [{}, { 'x': 1 }]) +@pytest.mark.parametrize("value", [{}, {"x": 1}]) @pytest.mark.write_disk -def test_memmap_ipc_chunked_structs(size: int, value: Dict[str, int], tmp_path: Path) -> None: +def test_memmap_ipc_chunked_structs( + size: int, value: dict[str, int], tmp_path: Path +) -> None: a = pl.Series("a", [value] * size, pl.Struct).to_frame() - c = pl.concat([a] * 2, how='vertical') + c = pl.concat([a] * 2, how="vertical") - f = tmp_path / 'f.ipc' + f = tmp_path / "f.ipc" c.write_ipc(f) assert_frame_equal(c, pl.read_ipc(f)) diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index 1e259e11daec..a4afd57e73e4 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -438,7 +438,7 @@ def test_json_infer_3_dtypes() -> None: assert out.dtypes[0] == pl.List(pl.String) -#NOTE: This doesn't work for 0, but that is normal +# NOTE: This doesn't work for 0, but that is normal @pytest.mark.parametrize("size", [1, 2, 13]) def test_zfs_json_roundtrip(size: int) -> None: a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame() From 11dbc10daef9d8720f7a20b7295ba3f2608abc83 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 14:38:03 +0200 Subject: [PATCH 06/17] last changes --- .../src/io/avro/read/deserialize.rs | 3 +- crates/polars-core/src/serde/series.rs | 6 ++- crates/polars-expr/src/expressions/window.rs | 7 ++- .../polars-ops/src/series/ops/to_dummies.rs | 6 +-- .../polars-pipe/src/executors/sources/csv.rs | 2 +- .../src/plans/functions/merge_sorted.rs | 6 +-- crates/polars/tests/it/io/avro/write.rs | 43 ++++++++++--------- .../polars/tests/it/io/parquet/arrow/mod.rs | 31 +++++++------ .../tests/unit/dataframe/test_null_count.py | 9 ++-- 9 files changed, 58 insertions(+), 55 deletions(-) diff --git a/crates/polars-arrow/src/io/avro/read/deserialize.rs b/crates/polars-arrow/src/io/avro/read/deserialize.rs index dc4c04ed15ea..7c626f2b74ee 100644 --- a/crates/polars-arrow/src/io/avro/read/deserialize.rs +++ b/crates/polars-arrow/src/io/avro/read/deserialize.rs @@ -56,8 +56,7 @@ fn make_mutable( .iter() .map(|field| make_mutable(field.dtype(), None, capacity)) .collect::>>()?; - 
Box::new(DynMutableStructArray::new(values, dtype.clone())) - as Box + Box::new(DynMutableStructArray::new(values, dtype.clone())) as Box }, other => { polars_bail!(nyi = "Deserializing type {other:#?} is still not implemented") diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 3b2415d4d5c2..0fb9d9f05f18 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -290,7 +290,11 @@ impl<'de> Deserialize<'de> for Series { for (f, v) in fields.iter().zip(values.iter()) { if f.dtype() != v.dtype() { - let err = format!("type mismatch for struct. expected: {}, given: {}", f.dtype(), v.dtype()); + let err = format!( + "type mismatch for struct. expected: {}, given: {}", + f.dtype(), + v.dtype() + ); return Err(de::Error::custom(err)); } } diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index 46b20c9a34b3..d03467d01da9 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -588,8 +588,11 @@ impl PhysicalExpr for WindowExpr { .1, ) } else { - let df_right = unsafe { DataFrame::new_no_checks_height_from_first(keys) }; - let df_left = unsafe { DataFrame::new_no_checks_height_from_first(group_by_columns) }; + let df_right = + unsafe { DataFrame::new_no_checks_height_from_first(keys) }; + let df_left = unsafe { + DataFrame::new_no_checks_height_from_first(group_by_columns) + }; Ok(private_left_join_multiple_keys(&df_left, &df_right, true)?.1) } }; diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index ab5015f97864..dfe3ba1a3ddf 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -46,11 +46,7 @@ impl ToDummies for Series { }) .collect::>(); - Ok(unsafe { - DataFrame::new_no_checks_height_from_first( - sort_columns(columns), - ) - }) + Ok(unsafe { DataFrame::new_no_checks_height_from_first(sort_columns(columns)) }) } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index cdb80970e253..38484f9c7255 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -218,7 +218,7 @@ impl Source for CsvSource { for data_chunk in &mut out { // The batched reader creates the column containing all nulls because the schema it // gets passed contains the column. - // + // // SAFETY: Columns are only replaced with columns // 1. of the same name, and // 2. of the same length. 
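
A note on the constructor pattern recurring in this commit: `DataFrame::new_no_checks_height_from_first` infers the frame height from the first column, which is only well-defined when at least one column exists. The sketch below illustrates that invariant; `Frame` and `new_height_from_first` are illustrative stand-ins, not the polars-core API, and plain `Vec<i64>` stands in for columns.

struct Frame {
    height: usize,
    columns: Vec<Vec<i64>>,
}

impl Frame {
    // Derive the height from the first column; with zero columns there is
    // nothing to measure, so this falls back to zero rows.
    fn new_height_from_first(columns: Vec<Vec<i64>>) -> Self {
        let height = columns.first().map_or(0, |c| c.len());
        Frame { height, columns }
    }
}

fn main() {
    let f = Frame::new_height_from_first(vec![vec![1, 2, 3]]);
    assert_eq!(f.height, 3);

    // A zero-column frame cannot express "three empty rows" this way,
    // which is why the zero-field-struct work stores the length explicitly.
    let empty = Frame::new_height_from_first(vec![]);
    assert_eq!(empty.height, 0);
    let _ = (f.columns, empty.columns);
}

This is the same gap that motivates threading an explicit `length` through `RecordBatchT::new` and `StructArray::new` in the patches above: a zero-field struct or zero-column frame has no child from which to infer a row count.
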
diff --git a/crates/polars-plan/src/plans/functions/merge_sorted.rs b/crates/polars-plan/src/plans/functions/merge_sorted.rs index de170583e2d9..6397a8374933 100644 --- a/crates/polars-plan/src/plans/functions/merge_sorted.rs +++ b/crates/polars-plan/src/plans/functions/merge_sorted.rs @@ -31,10 +31,8 @@ pub(super) fn merge_sorted(df: &DataFrame, column: &str) -> PolarsResult RecordBatchT> { Field::new("item2".into(), ArrowDataType::Int32, true), ]); - RecordBatchT::new(2, vec![ - Box::new(StructArray::new( - struct_dt.clone(), - 2, - vec![ - Box::new(PrimitiveArray::::from_slice([1, 2])), - Box::new(PrimitiveArray::::from([None, Some(1)])), - ], - None, - )), - Box::new(StructArray::new( - struct_dt, - 2, - vec![ - Box::new(PrimitiveArray::::from_slice([1, 2])), - Box::new(PrimitiveArray::::from([None, Some(1)])), - ], - Some([true, false].into()), - )), - ]) + RecordBatchT::new( + 2, + vec![ + Box::new(StructArray::new( + struct_dt.clone(), + 2, + vec![ + Box::new(PrimitiveArray::::from_slice([1, 2])), + Box::new(PrimitiveArray::::from([None, Some(1)])), + ], + None, + )), + Box::new(StructArray::new( + struct_dt, + 2, + vec![ + Box::new(PrimitiveArray::::from_slice([1, 2])), + Box::new(PrimitiveArray::::from([None, Some(1)])), + ], + Some([true, false].into()), + )), + ], + ) } fn avro_record() -> Record { diff --git a/crates/polars/tests/it/io/parquet/arrow/mod.rs b/crates/polars/tests/it/io/parquet/arrow/mod.rs index 5d259b595d9e..0a573eb4a186 100644 --- a/crates/polars/tests/it/io/parquet/arrow/mod.rs +++ b/crates/polars/tests/it/io/parquet/arrow/mod.rs @@ -1421,20 +1421,23 @@ fn generic_data() -> PolarsResult<(ArrowSchema, RecordBatchT>)> { Field::new("a12".into(), array12.dtype().clone(), true), Field::new("a13".into(), array13.dtype().clone(), true), ]); - let chunk = RecordBatchT::try_new(array1.len(), vec![ - array1.boxed(), - array2.boxed(), - array3.boxed(), - array4.boxed(), - array6.boxed(), - array7.boxed(), - array8.boxed(), - array9.boxed(), - array10.boxed(), - array11.boxed(), - array12.boxed(), - array13.boxed(), - ])?; + let chunk = RecordBatchT::try_new( + array1.len(), + vec![ + array1.boxed(), + array2.boxed(), + array3.boxed(), + array4.boxed(), + array6.boxed(), + array7.boxed(), + array8.boxed(), + array9.boxed(), + array10.boxed(), + array11.boxed(), + array12.boxed(), + array13.boxed(), + ], + )?; Ok((schema, chunk)) } diff --git a/py-polars/tests/unit/dataframe/test_null_count.py b/py-polars/tests/unit/dataframe/test_null_count.py index a9b1141a2a67..507bf0269517 100644 --- a/py-polars/tests/unit/dataframe/test_null_count.py +++ b/py-polars/tests/unit/dataframe/test_null_count.py @@ -23,9 +23,6 @@ def test_null_count(df: pl.DataFrame) -> None: # note: the zero-row and zero-col cases are always passed as explicit examples null_count, ncols = df.null_count(), len(df.columns) - if ncols == 0: - assert null_count.shape == (0, 0) - else: - assert null_count.shape == (1, ncols) - for idx, count in enumerate(null_count.rows()[0]): - assert count == sum(v is None for v in df.to_series(idx).to_list()) + assert null_count.shape == (1, ncols) + for idx, count in enumerate(null_count.rows()[0]): + assert count == sum(v is None for v in df.to_series(idx).to_list()) From b90b5bb7244436fce4ead59e37290a30a51ee19e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 14:43:24 +0200 Subject: [PATCH 07/17] add missing file --- .../polars-core/src/chunked_array/ops/bits.rs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 
crates/polars-core/src/chunked_array/ops/bits.rs diff --git a/crates/polars-core/src/chunked_array/ops/bits.rs b/crates/polars-core/src/chunked_array/ops/bits.rs new file mode 100644 index 000000000000..178df3a73d09 --- /dev/null +++ b/crates/polars-core/src/chunked_array/ops/bits.rs @@ -0,0 +1,21 @@ +use super::BooleanChunked; + +impl BooleanChunked { + pub fn num_trues(&self) -> usize { + self.downcast_iter() + .map(|arr| match arr.validity() { + None => arr.values().set_bits(), + Some(validity) => arr.values().num_intersections_with(validity), + }) + .sum() + } + + pub fn num_falses(&self) -> usize { + self.downcast_iter() + .map(|arr| match arr.validity() { + None => arr.values().unset_bits(), + Some(validity) => (!arr.values()).num_intersections_with(validity), + }) + .sum() + } +} From fcd7bce02afd2e32e6285ae3ac1735dc233106f9 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 14:43:40 +0200 Subject: [PATCH 08/17] fix typo --- py-polars/tests/unit/datatypes/test_struct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index fd8d6dd0c0e5..e9d8d7f30a8b 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -1129,5 +1129,5 @@ def test_zfs_row_encoding(size: int) -> None: gb = df.lazy().group_by(["a", "x"]).agg(pl.all().min()).collect(streaming=True) - # We need to ignore the order because the group_by is undeterministic + # We need to ignore the order because the group_by is non-deterministic assert_frame_equal(gb, df, check_row_order=False) From c9abf96c2953d18448e637f3a11f9f2611d11d4e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 15:07:22 +0200 Subject: [PATCH 09/17] remove failing test --- crates/polars-core/src/frame/mod.rs | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 308e5ccda3bb..33e339e01c98 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -3591,25 +3591,4 @@ mod test { assert_eq!(df.get_column_names(), &["a", "b", "c"]); Ok(()) } - - #[test] - fn test_empty_df_hstack() -> PolarsResult<()> { - let mut base = df!( - "a" => [1, 2, 3], - "b" => [1, 2, 3] - )?; - - // has got columns, but no rows - let mut df = base.clear(); - let out = df.with_column(Series::new("c".into(), [1]))?; - assert_eq!(out.shape(), (0, 3)); - assert!(out.iter().all(|s| s.len() == 0)); - - // no columns - base.columns = vec![]; - let out = base.with_column(Series::new("c".into(), [1]))?; - assert_eq!(out.shape(), (1, 1)); - - Ok(()) - } } From 1d41bded98ea763b00a2469a3d50b6e0b61312f1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 15:28:58 +0200 Subject: [PATCH 10/17] fix slice --- crates/polars-core/src/frame/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 33e339e01c98..60ae1c2c4eda 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -326,6 +326,8 @@ impl DataFrame { .filter(|l| *l != 1) .max() .unwrap_or(1); + + dbg!(broadcast_len); for col in &mut columns { // Length not equal to the broadcast len, needs broadcast or is an error. 
@@ -343,8 +345,12 @@ impl DataFrame { } } + dbg!(&columns); + let length = if columns.is_empty() { 0 } else { broadcast_len }; + dbg!(length); + Ok(unsafe { DataFrame::new_no_checks(length, columns) }) } @@ -2479,6 +2485,7 @@ impl DataFrame { let height = if let Some(fst) = col.first() { fst.len() } else { + let (_, length) = slice_offsets(offset, length, self.height()); length }; From 520a6b410a380ecff31974ef4c88bea0a65884e0 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 15:29:57 +0200 Subject: [PATCH 11/17] rustfmt --- crates/polars-core/src/frame/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 60ae1c2c4eda..4e4e2292b806 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -326,7 +326,7 @@ impl DataFrame { .filter(|l| *l != 1) .max() .unwrap_or(1); - + dbg!(broadcast_len); for col in &mut columns { From b1a8d9bcb5966499f78650fcbd6c0c66ba80c9b8 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 18:25:19 +0200 Subject: [PATCH 12/17] remove dbg oops --- crates/polars-core/src/frame/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 4e4e2292b806..b82d43365010 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -327,8 +327,6 @@ impl DataFrame { .max() .unwrap_or(1); - dbg!(broadcast_len); - for col in &mut columns { // Length not equal to the broadcast len, needs broadcast or is an error. let len = col.len(); @@ -345,12 +343,8 @@ impl DataFrame { } } - dbg!(&columns); - let length = if columns.is_empty() { 0 } else { broadcast_len }; - dbg!(length); - Ok(unsafe { DataFrame::new_no_checks(length, columns) }) } From 7a7440f9f13a316de8468088125c1b41c993940f Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 18:26:52 +0200 Subject: [PATCH 13/17] fix doc test --- crates/polars-arrow/src/array/struct_/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-arrow/src/array/struct_/mod.rs b/crates/polars-arrow/src/array/struct_/mod.rs index 6eebbc783a13..11d0f2de200f 100644 --- a/crates/polars-arrow/src/array/struct_/mod.rs +++ b/crates/polars-arrow/src/array/struct_/mod.rs @@ -27,7 +27,7 @@ use crate::compute::utils::combine_validities_and; /// Field::new("c".into(), ArrowDataType::Int32, false), /// ]; /// -/// let array = StructArray::new(ArrowDataType::Struct(fields), vec![boolean, int], None); +/// let array = StructArray::new(ArrowDataType::Struct(fields), 4, vec![boolean, int], None); /// ``` #[derive(Clone)] pub struct StructArray { From c6925651e94a3a6a2b05e1e648fc8a68500196e8 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 8 Oct 2024 18:35:36 +0200 Subject: [PATCH 14/17] update MutableStructArray error message --- crates/polars-arrow/src/array/struct_/mutable.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/crates/polars-arrow/src/array/struct_/mutable.rs b/crates/polars-arrow/src/array/struct_/mutable.rs index ab4090053899..e066d7b6aef2 100644 --- a/crates/polars-arrow/src/array/struct_/mutable.rs +++ b/crates/polars-arrow/src/array/struct_/mutable.rs @@ -34,10 +34,7 @@ fn check( .enumerate() .try_for_each(|(index, (dtype, child))| { if dtype != child { - polars_bail!(ComputeError: - "The children DataTypes of a StructArray must equal the children data types. 
However, the field {index} has data type {dtype:?} but the value has data type {child:?}"
-                )
+                polars_bail!(ComputeError: "The children DataTypes of a StructArray must equal the children data types.\nHowever, the field {index} has data type {dtype:?} but the value has data type {child:?}")
             } else {
                 Ok(())
             }
@@ -49,10 +46,7 @@ fn check(
         .enumerate()
         .try_for_each(|(index, f_length)| {
             if f_length != length {
-                polars_bail!(ComputeError:
-                    "The children must have an equal number of values.
-                     However, the values at index {index} have a length of {f_length}, which is different from given length {length}."
-                )
+                polars_bail!(ComputeError: "The children must have the given number of values.\nHowever, the values at index {index} have a length of {f_length}, which is different from given length {length}.")
             } else {
                 Ok(())
             }

From a9b9303ac2285bbb64246e7ccb8380db07d99997 Mon Sep 17 00:00:00 2001
From: coastalwhite
Date: Tue, 8 Oct 2024 18:41:13 +0200
Subject: [PATCH 15/17] remove unnecessary to_string

---
 crates/polars-arrow/src/record_batch.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/polars-arrow/src/record_batch.rs b/crates/polars-arrow/src/record_batch.rs
index a0c6bbb6c290..f58d129831f1 100644
--- a/crates/polars-arrow/src/record_batch.rs
+++ b/crates/polars-arrow/src/record_batch.rs
@@ -33,7 +33,7 @@ impl<A: AsRef<dyn Array>> RecordBatchT<A> {
     pub fn try_new(length: usize, arrays: Vec<A>) -> PolarsResult<Self> {
         polars_ensure!(
             arrays.iter().all(|arr| arr.as_ref().len() == length),
-            ComputeError: "RecordBatch requires all its arrays to have an equal number of rows".to_string(),
+            ComputeError: "RecordBatch requires all its arrays to have an equal number of rows",
         );
 
         Ok(Self { length, arrays })

From 8f65209cdf88be825adcb999cda6e7f1e64e7790 Mon Sep 17 00:00:00 2001
From: coastalwhite
Date: Wed, 9 Oct 2024 08:17:24 +0200
Subject: [PATCH 16/17] add better SAFETY comment for empty struct construction

---
 crates/polars-core/src/chunked_array/from.rs        | 10 ++++++++--
 crates/polars-core/src/chunked_array/struct_/mod.rs |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/from.rs b/crates/polars-core/src/chunked_array/from.rs
index 33e984b94e0f..c84dc20a5d63 100644
--- a/crates/polars-core/src/chunked_array/from.rs
+++ b/crates/polars-core/src/chunked_array/from.rs
@@ -211,6 +211,7 @@ where
     /// Create a new [`ChunkedArray`] from existing chunks.
     ///
     /// # Safety
+    ///
     /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
     pub unsafe fn from_chunks_and_dtype(
         name: PlSmallStr,
         chunks: Vec<ArrayRef>,
@@ -225,10 +226,15 @@ where
                 assert_eq!(chunks[0].dtype(), &dtype.to_arrow(CompatLevel::newest()))
             }
         }
-        let field = Arc::new(Field::new(name, dtype));
-        ChunkedArray::new_with_compute_len(field, chunks)
+
+        Self::from_chunks_and_dtype_unchecked(name, chunks, dtype)
     }
 
+    /// Create a new [`ChunkedArray`] from existing chunks.
+    ///
+    /// # Safety
+    ///
+    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
     pub(crate) unsafe fn from_chunks_and_dtype_unchecked(
         name: PlSmallStr,
         chunks: Vec<ArrayRef>,
diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs
index b098d6e7fe05..af481978f409 100644
--- a/crates/polars-core/src/chunked_array/struct_/mod.rs
+++ b/crates/polars-core/src/chunked_array/struct_/mod.rs
@@ -28,8 +28,8 @@ fn constructor<'a, I: ExactSizeIterator<Item = &'a Series> + Clone>(
         let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
         let chunks = vec![StructArray::new(arrow_dtype, length, Vec::new(), None).boxed()];
 
-        // SAFETY: invariants checked above.
-        return Ok(unsafe { StructChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) });
+        // SAFETY: We construct each chunk above to have the `Struct` data type.
+        return Ok(unsafe { StructChunked::from_chunks_and_dtype(name, chunks, dtype) });
     }
 
     // Different chunk lengths: rechunk and recurse.

From d87fbf864f32ac6acde7f3ea308854a19a7f75f8 Mon Sep 17 00:00:00 2001
From: coastalwhite
Date: Wed, 9 Oct 2024 08:20:06 +0200
Subject: [PATCH 17/17] add comment on why we are clearing

---
 crates/polars-core/src/chunked_array/struct_/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs
index af481978f409..05ac32424d5e 100644
--- a/crates/polars-core/src/chunked_array/struct_/mod.rs
+++ b/crates/polars-core/src/chunked_array/struct_/mod.rs
@@ -121,7 +121,9 @@ impl StructChunked {
     }
 
     if length == 0 {
+        // @NOTE: There are columns that are being broadcasted so we need to clear those.
         let new_fields = fields.map(|s| s.clear()).collect::<Vec<_>>();
+
         return constructor(name, length, new_fields.iter());
     }
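
As a closing usage sketch: with the four-argument `StructArray::new` shown in the [PATCH 13/17] doc test, a struct with zero fields can still carry a row count. The import paths below are an assumption based on the crate layout in this series, and the `len` call assumes the usual `Array` accessor; treat this as a sketch rather than a verified snippet.

use polars_arrow::array::{Array, StructArray};
use polars_arrow::datatypes::ArrowDataType;

fn main() {
    // Explicit length is the point of the series: with zero fields there
    // is no child array from which a row count could be inferred.
    let zero_field = StructArray::new(
        ArrowDataType::Struct(vec![]), // struct type with no fields
        3,                             // explicit length: three empty rows
        vec![],                        // no child arrays
        None,                          // no validity bitmap
    );
    assert_eq!(zero_field.len(), 3);
}

This mirrors the constructor used in `crates/polars-core/src/chunked_array/struct_/mod.rs` above, which builds `StructArray::new(arrow_dtype, length, Vec::new(), None)` for the zero-field case.
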