From 2b082c78db03611d2a5123787d3ea7c9ed58dacc Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 10 Sep 2024 18:44:03 +0200 Subject: [PATCH 01/42] start of the massacre [skip-ci] --- .../logical/categorical/ops/unique.rs | 2 +- .../polars-core/src/chunked_array/ndarray.rs | 2 +- .../polars-core/src/chunked_array/ops/mod.rs | 2 +- .../chunked_array/ops/sort/arg_bottom_k.rs | 2 +- .../ops/sort/arg_sort_multiple.rs | 34 +- .../src/chunked_array/ops/sort/categorical.rs | 6 +- .../src/chunked_array/ops/sort/mod.rs | 30 +- .../polars-core/src/chunked_array/random.rs | 4 +- .../src/chunked_array/struct_/frame.rs | 4 +- .../src/chunked_array/struct_/mod.rs | 13 +- crates/polars-core/src/fmt.rs | 9 +- crates/polars-core/src/frame/arithmetic.rs | 11 +- crates/polars-core/src/frame/chunks.rs | 10 +- crates/polars-core/src/frame/explode.rs | 63 +- crates/polars-core/src/frame/from.rs | 1 + .../src/frame/group_by/into_groups.rs | 2 +- crates/polars-core/src/frame/group_by/mod.rs | 106 +- crates/polars-core/src/frame/horizontal.rs | 17 +- crates/polars-core/src/frame/mod.rs | 969 +++++++++++++++--- crates/polars-core/src/frame/row/dataframe.rs | 28 +- crates/polars-core/src/frame/row/mod.rs | 2 +- crates/polars-core/src/frame/row/transpose.rs | 84 +- crates/polars-core/src/frame/top_k.rs | 2 +- .../polars-core/src/frame/upstream_traits.rs | 8 +- crates/polars-core/src/functions.rs | 2 +- .../polars-core/src/hashing/vector_hasher.rs | 6 +- crates/polars-core/src/prelude.rs | 2 +- crates/polars-core/src/serde/df.rs | 6 +- crates/polars-core/src/serde/mod.rs | 8 +- crates/polars-core/src/series/from.rs | 8 + .../src/series/implementations/binary.rs | 2 +- .../series/implementations/binary_offset.rs | 2 +- .../src/series/implementations/boolean.rs | 2 +- .../src/series/implementations/categorical.rs | 2 +- .../src/series/implementations/date.rs | 2 +- .../src/series/implementations/datetime.rs | 2 +- .../src/series/implementations/duration.rs | 2 +- 
.../src/series/implementations/floats.rs | 2 +- .../src/series/implementations/mod.rs | 2 +- .../src/series/implementations/string.rs | 2 +- .../src/series/implementations/time.rs | 2 +- crates/polars-core/src/series/mod.rs | 2 +- crates/polars-core/src/series/series_trait.rs | 2 +- crates/polars-core/src/testing.rs | 4 +- crates/polars-core/src/tests.rs | 6 +- crates/polars-core/src/utils/flatten.rs | 2 +- crates/polars-core/src/utils/mod.rs | 6 +- crates/polars-ops/src/chunked_array/top_k.rs | 10 +- .../polars-ops/src/frame/join/asof/groups.rs | 20 +- crates/polars-ops/src/frame/join/general.rs | 4 +- .../polars-ops/src/frame/join/iejoin/mod.rs | 28 +- .../polars-ops/src/frame/join/merge_sorted.rs | 6 +- crates/polars-ops/src/frame/join/mod.rs | 30 +- crates/polars-ops/src/frame/mod.rs | 2 +- crates/polars-ops/src/frame/pivot/mod.rs | 19 +- .../polars-ops/src/frame/pivot/positioning.rs | 60 +- crates/polars-ops/src/frame/pivot/unpivot.rs | 16 +- .../polars-ops/src/series/ops/horizontal.rs | 19 +- crates/polars-ops/src/series/ops/replace.rs | 11 +- .../polars-ops/src/series/ops/to_dummies.rs | 4 +- crates/polars-ops/src/series/ops/various.rs | 10 +- .../src/dsl/function_expr/fill_null.rs | 2 +- 62 files changed, 1237 insertions(+), 491 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index a0f4a4ef90db..7b851c5def54 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -66,7 +66,7 @@ impl CategoricalChunked { let mut counts = groups.group_count(); counts.rename(PlSmallStr::from_static("counts")); - let cols = vec![values.into_series(), counts.into_series()]; + let cols = vec![values.into_series().into(), counts.into_series().into()]; let df = unsafe { DataFrame::new_no_checks(cols) }; df.sort( ["counts"], diff --git 
a/crates/polars-core/src/chunked_array/ndarray.rs b/crates/polars-core/src/chunked_array/ndarray.rs index 079061e31478..a3017f0103e6 100644 --- a/crates/polars-core/src/chunked_array/ndarray.rs +++ b/crates/polars-core/src/chunked_array/ndarray.rs @@ -108,7 +108,7 @@ impl DataFrame { let columns = self.get_columns(); POOL.install(|| { columns.par_iter().enumerate().try_for_each(|(col_idx, s)| { - let s = s.cast(&N::get_dtype())?; + let s = s.as_materialized_series().cast(&N::get_dtype())?; let s = match s.dtype() { DataType::Float32 => { let ca = s.f32().unwrap(); diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index b252d23814eb..456ac561a3f1 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -384,7 +384,7 @@ pub trait ChunkSort { #[allow(unused_variables)] fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], _options: &SortMultipleOptions, ) -> PolarsResult { polars_bail!(opq = arg_sort_multiple, T::get_dtype()); diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs index e774c8ba51f3..cad95d6b1d10 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs @@ -31,7 +31,7 @@ impl PartialOrd for CompareRow<'_> { /// Similar to .argsort() then .slice(0, k) but with a more efficient implementation. 
pub fn _arg_bottom_k( k: usize, - by_column: &[Series], + by_column: &[Column], sort_options: &mut SortMultipleOptions, ) -> PolarsResult> { let from_n_rows = by_column[0].len(); diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs index d659ebab7e69..4fc01211f3df 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs @@ -8,7 +8,7 @@ use crate::utils::_split_offsets; pub(crate) fn args_validate( ca: &ChunkedArray, - other: &[Series], + other: &[Column], param_value: &[bool], param_name: &str, ) -> PolarsResult<()> { @@ -25,7 +25,7 @@ pub(crate) fn args_validate( pub(crate) fn arg_sort_multiple_impl( mut vals: Vec<(IdxSize, T)>, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { let nulls_last = &options.nulls_last; @@ -36,7 +36,7 @@ pub(crate) fn arg_sort_multiple_impl( let compare_inner: Vec<_> = by .iter() - .map(|s| s.into_total_ord_inner()) + .map(|s| s.as_materialized_series().into_total_ord_inner()) .collect_trusted(); let first_descending = descending[0]; @@ -106,7 +106,7 @@ pub fn _get_rows_encoded_compat_array(by: &Series) -> PolarsResult { Ok(out) } -pub fn encode_rows_vertical_par_unordered(by: &[Series]) -> PolarsResult { +pub fn encode_rows_vertical_par_unordered(by: &[Column]) -> PolarsResult { let n_threads = POOL.current_num_threads(); let len = by[0].len(); let splits = _split_offsets(len, n_threads); @@ -129,7 +129,7 @@ pub fn encode_rows_vertical_par_unordered(by: &[Series]) -> PolarsResult PolarsResult { let n_threads = POOL.current_num_threads(); let len = by[0].len(); @@ -138,14 +138,15 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( let chunks = splits.into_par_iter().map(|(offset, len)| { let sliced = by .iter() - .map(|s| s.slice(offset as i64, len)) + .map(|s| s.as_materialized_series().slice(offset as 
i64, len)) + .map(Column::from) .collect::>(); let rows = _get_rows_encoded_unordered(&sliced)?; let validities = sliced .iter() - .flat_map(|s| { - let s = s.rechunk(); + .flat_map(|c| { + let s = c.as_materialized_series().rechunk(); #[allow(clippy::unnecessary_to_owned)] s.chunks() .to_vec() @@ -165,7 +166,7 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( )) } -pub(crate) fn encode_rows_unordered(by: &[Series]) -> PolarsResult { +pub(crate) fn encode_rows_unordered(by: &[Column]) -> PolarsResult { let rows = _get_rows_encoded_unordered(by)?; Ok(BinaryOffsetChunked::with_chunk( PlSmallStr::EMPTY, @@ -173,11 +174,11 @@ pub(crate) fn encode_rows_unordered(by: &[Series]) -> PolarsResult PolarsResult { +pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult { let mut cols = Vec::with_capacity(by.len()); let mut fields = Vec::with_capacity(by.len()); for by in by { - let arr = _get_rows_encoded_compat_array(by)?; + let arr = _get_rows_encoded_compat_array(by.as_materialized_series())?; let field = EncodingField::new_unsorted(); match arr.dtype() { // Flatten the struct fields. 
@@ -198,7 +199,7 @@ pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult { } pub fn _get_rows_encoded( - by: &[Series], + by: &[Column], descending: &[bool], nulls_last: &[bool], ) -> PolarsResult { @@ -209,6 +210,7 @@ pub fn _get_rows_encoded( let mut fields = Vec::with_capacity(by.len()); for ((by, desc), null_last) in by.iter().zip(descending).zip(nulls_last) { + let by = by.as_materialized_series(); let arr = _get_rows_encoded_compat_array(by)?; let sort_field = EncodingField { descending: *desc, @@ -236,7 +238,7 @@ pub fn _get_rows_encoded( pub fn _get_rows_encoded_ca( name: PlSmallStr, - by: &[Series], + by: &[Column], descending: &[bool], nulls_last: &[bool], ) -> PolarsResult { @@ -245,7 +247,7 @@ pub fn _get_rows_encoded_ca( } pub fn _get_rows_encoded_arr( - by: &[Series], + by: &[Column], descending: &[bool], nulls_last: &[bool], ) -> PolarsResult> { @@ -254,14 +256,14 @@ pub fn _get_rows_encoded_arr( pub fn _get_rows_encoded_ca_unordered( name: PlSmallStr, - by: &[Series], + by: &[Column], ) -> PolarsResult { _get_rows_encoded_unordered(by) .map(|rows| BinaryOffsetChunked::with_chunk(name, rows.into_array())) } pub(crate) fn argsort_multiple_row_fmt( - by: &[Series], + by: &[Column], mut descending: Vec, mut nulls_last: Vec, parallel: bool, diff --git a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs index 0dcb2cb84b51..a984c92147b9 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs @@ -76,7 +76,7 @@ impl CategoricalChunked { pub(crate) fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { if self.uses_lexical_ordering() { @@ -177,7 +177,7 @@ mod test { SortMultipleOptions::default().with_order_descending_multi([false, false]), )?; let out = out.column("cat")?; - let cat = out.categorical()?; + let cat = 
out.as_materialized_series().categorical()?; assert_order(cat, &["a", "a", "b", "c"]); let out = df.sort( @@ -185,7 +185,7 @@ mod test { SortMultipleOptions::default().with_order_descending_multi([false, false]), )?; let out = out.column("cat")?; - let cat = out.categorical()?; + let cat = out.as_materialized_series().categorical()?; assert_order(cat, &["b", "c", "a", "a"]); } Ok(()) diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index 1c1940b6f10d..cfe30bb59c7d 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -236,7 +236,7 @@ where fn arg_sort_multiple_numeric( ca: &ChunkedArray, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { args_validate(ca, by, &options.descending, "descending")?; @@ -294,7 +294,7 @@ where /// We assume that all numeric `Series` are of the same type, if not it will panic fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { arg_sort_multiple_numeric(self, by, options) @@ -349,7 +349,7 @@ impl ChunkSort for StringChunked { /// fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.as_binary().arg_sort_multiple(by, options) @@ -427,7 +427,7 @@ impl ChunkSort for BinaryChunked { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { args_validate(self, by, &options.descending, "descending")?; @@ -574,7 +574,7 @@ impl ChunkSort for BinaryOffsetChunked { /// uphold this contract. If not, it will panic. 
fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { args_validate(self, by, &options.descending, "descending")?; @@ -599,7 +599,7 @@ impl StructChunked { pub(crate) fn arg_sort(&self, options: SortOptions) -> IdxCa { let bin = _get_rows_encoded_ca( self.name().clone(), - &[self.clone().into_series()], + &[self.clone().into_column()], &[options.descending], &[options.nulls_last], ) @@ -692,7 +692,7 @@ impl ChunkSort for BooleanChunked { } fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { let mut vals = Vec::with_capacity(self.len()); @@ -754,14 +754,16 @@ pub fn _broadcast_bools(n_cols: usize, values: &mut Vec) { } pub(crate) fn prepare_arg_sort( - columns: Vec, + columns: Vec, sort_options: &mut SortMultipleOptions, -) -> PolarsResult<(Series, Vec)> { +) -> PolarsResult<(Column, Vec)> { let n_cols = columns.len(); let mut columns = columns .iter() + .map(Column::as_materialized_series) .map(convert_sort_column_multi_sort) + .map(|s| s.map(Column::from)) .collect::>>()?; _broadcast_bools(n_cols, &mut sort_options.descending); @@ -881,11 +883,15 @@ mod test { PlSmallStr::from_static("c"), &["a", "b", "c", "d", "e", "f", "g", "h"], ); - let df = DataFrame::new(vec![a.into_series(), b.into_series(), c.into_series()])?; + let df = DataFrame::new(vec![ + a.into_series().into(), + b.into_series().into(), + c.into_series().into(), + ])?; let out = df.sort(["a", "b", "c"], SortMultipleOptions::default())?; assert_eq!( - Vec::from(out.column("b")?.i64()?), + Vec::from(out.column("b")?.as_series().unwrap().i64()?), &[ Some(0), Some(2), @@ -905,7 +911,7 @@ mod test { ) .into_series(); let b = Int32Chunked::new(PlSmallStr::from_static("b"), &[5, 4, 2, 3, 4, 5]).into_series(); - let df = DataFrame::new(vec![a, b])?; + let df = DataFrame::new(vec![a.into(), b.into()])?; let out = df.sort(["a", "b"], SortMultipleOptions::default())?; let expected = df!( 
diff --git a/crates/polars-core/src/chunked_array/random.rs b/crates/polars-core/src/chunked_array/random.rs index 94ab33f02cee..1ad3d2b7abd7 100644 --- a/crates/polars-core/src/chunked_array/random.rs +++ b/crates/polars-core/src/chunked_array/random.rs @@ -193,7 +193,7 @@ impl DataFrame { match n.get(0) { Some(n) => self.sample_n_literal(n as usize, with_replacement, shuffle, seed), None => { - let new_cols = self.columns.iter().map(Series::clear).collect_trusted(); + let new_cols = self.columns.iter().map(Column::clear).collect_trusted(); Ok(unsafe { DataFrame::new_no_checks(new_cols) }) }, } @@ -238,7 +238,7 @@ impl DataFrame { self.sample_n_literal(n, with_replacement, shuffle, seed) }, None => { - let new_cols = self.columns.iter().map(Series::clear).collect_trusted(); + let new_cols = self.columns.iter().map(Column::clear).collect_trusted(); Ok(unsafe { DataFrame::new_no_checks(new_cols) }) }, } diff --git a/crates/polars-core/src/chunked_array/struct_/frame.rs b/crates/polars-core/src/chunked_array/struct_/frame.rs index 280a9df6da56..92e46ac8635a 100644 --- a/crates/polars-core/src/chunked_array/struct_/frame.rs +++ b/crates/polars-core/src/chunked_array/struct_/frame.rs @@ -5,6 +5,8 @@ use crate::prelude::StructChunked; impl DataFrame { pub fn into_struct(self, name: PlSmallStr) -> StructChunked { - StructChunked::from_series(name, &self.columns).expect("same invariants") + // @scalar-opt + let series = self.materialized_column_iter().cloned().collect::>(); + StructChunked::from_series(name, &series).expect("same invariants") } } diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 882251a43d6d..95e893bbe73e 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -290,15 +290,15 @@ impl StructChunked { } pub fn get_row_encoded_array(&self, options: SortOptions) -> PolarsResult> { - let s = 
self.clone().into_series(); - _get_rows_encoded_arr(&[s], &[options.descending], &[options.nulls_last]) + let c = self.clone().into_column(); + _get_rows_encoded_arr(&[c], &[options.descending], &[options.nulls_last]) } pub fn get_row_encoded(&self, options: SortOptions) -> PolarsResult { - let s = self.clone().into_series(); + let c = self.clone().into_column(); _get_rows_encoded_ca( self.name().clone(), - &[s], + &[c], &[options.descending], &[options.nulls_last], ) @@ -346,8 +346,11 @@ impl StructChunked { } pub fn unnest(self) -> DataFrame { + // @scalar-opt + let columns = self.fields_as_series().into_iter().map(Column::from).collect(); + // SAFETY: invariants for struct are the same - unsafe { DataFrame::new_no_checks(self.fields_as_series()) } + unsafe { DataFrame::new_no_checks(columns) } } /// Get access to one of this `[StructChunked]`'s fields diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 00455a1a841a..bcb66b441ccd 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -616,8 +616,7 @@ impl Display for DataFrame { for i in 0..(half + rest) { let row = self - .columns - .iter() + .materialized_column_iter() .map(|s| s.str_value(i).unwrap()) .collect(); @@ -630,8 +629,7 @@ impl Display for DataFrame { rows.push(dots); for i in (height - half)..height { let row = self - .columns - .iter() + .materialized_column_iter() .map(|s| s.str_value(i).unwrap()) .collect(); @@ -644,8 +642,7 @@ impl Display for DataFrame { for i in 0..height { if self.width() > 0 { let row = self - .columns - .iter() + .materialized_column_iter() .map(|s| s.str_value(i).unwrap()) .collect(); diff --git a/crates/polars-core/src/frame/arithmetic.rs b/crates/polars-core/src/frame/arithmetic.rs index 69e2279cd47f..6d184b2960c9 100644 --- a/crates/polars-core/src/frame/arithmetic.rs +++ b/crates/polars-core/src/frame/arithmetic.rs @@ -20,9 +20,9 @@ macro_rules! 
impl_arithmetic { let rhs = $rhs.cast(&st)?; let cols = POOL.install(|| { $self - .columns - .par_iter() + .par_materialized_column_iter() .map(|s| $operand(&s.cast(&st)?, &rhs)) + .map(|s| s.map(Column::from)) .collect::>() })?; Ok(unsafe { DataFrame::new_no_checks(cols) }) @@ -122,6 +122,9 @@ impl DataFrame { .par_iter() .zip(other.get_columns().par_iter()) .map(|(l, r)| { + let l = l.as_materialized_series(); + let r = r.as_materialized_series(); + let diff_l = max_len - l.len(); let diff_r = max_len - r.len(); @@ -136,7 +139,7 @@ impl DataFrame { r = r.extend_constant(AnyValue::Null, diff_r)?; }; - f(&l, &r) + f(&l, &r).map(Column::from) }); let mut cols = POOL.install(|| cols.collect::>>())?; @@ -152,7 +155,7 @@ impl DataFrame { // trick to fill a series with nulls let vals: &[Option] = &[None]; let s = Series::new(name.clone(), vals).cast(dtype)?; - cols.push(s.new_from_index(0, max_len)) + cols.push(s.new_from_index(0, max_len).into()) } } DataFrame::new(cols) diff --git a/crates/polars-core/src/frame/chunks.rs b/crates/polars-core/src/frame/chunks.rs index 349a77c56d75..3fffbc1ce22f 100644 --- a/crates/polars-core/src/frame/chunks.rs +++ b/crates/polars-core/src/frame/chunks.rs @@ -9,12 +9,12 @@ impl TryFrom<(RecordBatch, &ArrowSchema)> for DataFrame { type Error = PolarsError; fn try_from(arg: (RecordBatch, &ArrowSchema)) -> PolarsResult { - let columns: PolarsResult> = arg + let columns: PolarsResult> = arg .0 .columns() .iter() .zip(arg.1.iter_values()) - .map(|(arr, field)| Series::try_from((field, arr.clone()))) + .map(|(arr, field)| Series::try_from((field, arr.clone())).map(Column::from)) .collect(); DataFrame::new(columns?) 
@@ -29,7 +29,11 @@ impl DataFrame { let columns = self .get_columns() .iter() - .map(|s| s.select_chunk(i)) + .map(|column| match column { + Column::Series(s) => s.select_chunk(i), + Column::Scalar(s) => s.select_chunk(i), + }) + .map(Column::from) .collect::>(); DataFrame::new_no_checks(columns) diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs index 3e597756eb1e..c12086def533 100644 --- a/crates/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -29,12 +29,12 @@ pub struct UnpivotArgsIR { } impl DataFrame { - pub fn explode_impl(&self, mut columns: Vec) -> PolarsResult { + pub fn explode_impl(&self, mut columns: Vec) -> PolarsResult { polars_ensure!(!columns.is_empty(), InvalidOperation: "no columns provided in explode"); let mut df = self.clone(); if self.is_empty() { for s in &columns { - df.with_column(s.explode()?)?; + df.with_column(s.as_materialized_series().explode()?)?; } return Ok(df); } @@ -57,14 +57,16 @@ impl DataFrame { let exploded_columns = POOL.install(|| { columns .par_iter() + .map(Column::as_materialized_series) .map(get_exploded) + .map(|s| s.map(|(s, o)| (Column::from(s), o))) .collect::>>() })?; fn process_column( original_df: &DataFrame, df: &mut DataFrame, - exploded: Series, + exploded: Column, ) -> PolarsResult<()> { if exploded.len() == df.height() || df.width() == 0 { let col_idx = original_df.check_name_to_idx(exploded.name().as_str())?; @@ -187,7 +189,7 @@ impl DataFrame { { // We need to sort the column by order of original occurrence. 
Otherwise the insert by index // below will panic - let columns = self.select_series(columns)?; + let columns = self.select_columns(columns)?; self.explode_impl(columns) } } @@ -203,17 +205,41 @@ mod test { let s0 = Series::new(PlSmallStr::from_static("a"), &[1i8, 2, 3]); let s1 = Series::new(PlSmallStr::from_static("b"), &[1i8, 1, 1]); let s2 = Series::new(PlSmallStr::from_static("c"), &[2i8, 2, 2]); - let list = Series::new(PlSmallStr::from_static("foo"), &[s0, s1, s2]); + let list = Column::new(PlSmallStr::from_static("foo"), &[s0, s1, s2]); - let s0 = Series::new(PlSmallStr::from_static("B"), [1, 2, 3]); - let s1 = Series::new(PlSmallStr::from_static("C"), [1, 1, 1]); + let s0 = Column::new(PlSmallStr::from_static("B"), [1, 2, 3]); + let s1 = Column::new(PlSmallStr::from_static("C"), [1, 1, 1]); let df = DataFrame::new(vec![list, s0.clone(), s1.clone()]).unwrap(); let exploded = df.explode(["foo"]).unwrap(); assert_eq!(exploded.shape(), (9, 3)); - assert_eq!(exploded.column("C").unwrap().i32().unwrap().get(8), Some(1)); - assert_eq!(exploded.column("B").unwrap().i32().unwrap().get(8), Some(3)); assert_eq!( - exploded.column("foo").unwrap().i8().unwrap().get(8), + exploded + .column("C") + .unwrap() + .as_materialized_series() + .i32() + .unwrap() + .get(8), + Some(1) + ); + assert_eq!( + exploded + .column("B") + .unwrap() + .as_materialized_series() + .i32() + .unwrap() + .get(8), + Some(3) + ); + assert_eq!( + exploded + .column("foo") + .unwrap() + .as_materialized_series() + .i8() + .unwrap() + .get(8), Some(2) ); } @@ -223,12 +249,12 @@ mod test { fn test_explode_df_empty_list() -> PolarsResult<()> { let s0 = Series::new(PlSmallStr::from_static("a"), &[1, 2, 3]); let s1 = Series::new(PlSmallStr::from_static("b"), &[1, 1, 1]); - let list = Series::new( + let list = Column::new( PlSmallStr::from_static("foo"), &[s0, s1.clone(), s1.clear()], ); - let s0 = Series::new(PlSmallStr::from_static("B"), [1, 2, 3]); - let s1 = 
Series::new(PlSmallStr::from_static("C"), [1, 1, 1]); + let s0 = Column::new(PlSmallStr::from_static("B"), [1, 2, 3]); + let s1 = Column::new(PlSmallStr::from_static("C"), [1, 1, 1]); let df = DataFrame::new(vec![list, s0.clone(), s1.clone()])?; let out = df.explode(["foo"])?; @@ -240,9 +266,13 @@ mod test { assert!(out.equals_missing(&expected)); - let list = Series::new( + let list = Column::new( PlSmallStr::from_static("foo"), - [s0.clone(), s1.clear(), s1.clone()], + [ + s0.as_materialized_series().clone(), + s1.as_materialized_series().clear(), + s1.as_materialized_series().clone(), + ], ); let df = DataFrame::new(vec![list, s0, s1])?; let out = df.explode(["foo"])?; @@ -261,12 +291,13 @@ mod test { fn test_explode_single_col() -> PolarsResult<()> { let s0 = Series::new(PlSmallStr::from_static("a"), &[1i32, 2, 3]); let s1 = Series::new(PlSmallStr::from_static("b"), &[1i32, 1, 1]); - let list = Series::new(PlSmallStr::from_static("foo"), &[s0, s1]); + let list = Column::new(PlSmallStr::from_static("foo"), &[s0, s1]); let df = DataFrame::new(vec![list])?; let out = df.explode(["foo"])?; let out = out .column("foo")? + .as_materialized_series() .i32()? 
.into_no_null_iter() .collect::>(); diff --git a/crates/polars-core/src/frame/from.rs b/crates/polars-core/src/frame/from.rs index 5c3e1a8cb212..5ec5d98a1597 100644 --- a/crates/polars-core/src/frame/from.rs +++ b/crates/polars-core/src/frame/from.rs @@ -23,6 +23,7 @@ impl TryFrom for DataFrame { Some(&fld.metadata), ) } + .map(Column::from) }) .collect::>>()?; DataFrame::new(columns) diff --git a/crates/polars-core/src/frame/group_by/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs index bdaa439a1232..519d0d2d0b0e 100644 --- a/crates/polars-core/src/frame/group_by/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -320,7 +320,7 @@ impl IntoGroupsProxy for ListChunked { sorted: bool, ) -> PolarsResult { multithreaded &= POOL.current_num_threads() > 1; - let by = &[self.clone().into_series()]; + let by = &[self.clone().into_column()]; let ca = if multithreaded { encode_rows_vertical_par_unordered(by).unwrap() } else { diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index 5dd631a51f0f..e02d6069c89a 100644 --- a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -28,7 +28,7 @@ use crate::prelude::sort::arg_sort_multiple::{ impl DataFrame { pub fn group_by_with_series( &self, - mut by: Vec, + mut by: Vec, multithreaded: bool, sorted: bool, ) -> PolarsResult { @@ -54,8 +54,8 @@ impl DataFrame { }; let groups = if by.len() == 1 { - let series = &by[0]; - series.group_tuples(multithreaded, sorted) + let column = &by[0]; + column.as_materialized_series().group_tuples(multithreaded, sorted) } else if by.iter().any(|s| s.dtype().is_object()) { #[cfg(feature = "object")] { @@ -116,7 +116,7 @@ impl DataFrame { I: IntoIterator, S: Into, { - let selected_keys = self.select_series(by)?; + let selected_keys = self.select_columns(by)?; self.group_by_with_series(selected_keys, true, false) } @@ -127,7 +127,7 @@ impl DataFrame { I: 
IntoIterator, S: Into, { - let selected_keys = self.select_series(by)?; + let selected_keys = self.select_columns(by)?; self.group_by_with_series(selected_keys, true, true) } } @@ -184,7 +184,7 @@ impl DataFrame { #[derive(Debug, Clone)] pub struct GroupBy<'df> { pub df: &'df DataFrame, - pub(crate) selected_keys: Vec, + pub(crate) selected_keys: Vec, // [first idx, [other idx]] groups: GroupsProxy, // columns selected for aggregation @@ -194,7 +194,7 @@ pub struct GroupBy<'df> { impl<'df> GroupBy<'df> { pub fn new( df: &'df DataFrame, - by: Vec, + by: Vec, groups: GroupsProxy, selected_agg: Option>, ) -> Self { @@ -245,7 +245,7 @@ impl<'df> GroupBy<'df> { std::mem::take(&mut self.groups) } - pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec { + pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec { #[allow(unused_assignments)] // needed to keep the lifetimes valid for this scope let mut groups_owned = None; @@ -260,6 +260,7 @@ impl<'df> GroupBy<'df> { POOL.install(|| { self.selected_keys .par_iter() + .map(Column::as_materialized_series) .map(|s| { match groups { GroupsProxy::Idx(groups) => { @@ -293,19 +294,20 @@ impl<'df> GroupBy<'df> { }, } }) + .map(|s| Column::from(s)) .collect() }) } - pub fn keys(&self) -> Vec { + pub fn keys(&self) -> Vec { self.keys_sliced(None) } - fn prepare_agg(&self) -> PolarsResult<(Vec, Vec)> { + fn prepare_agg(&self) -> PolarsResult<(Vec, Vec)> { let keys = self.keys(); let agg_col = match &self.selected_agg { - Some(selection) => self.df.select_series_impl(selection.as_slice()), + Some(selection) => self.df.select_columns_impl(selection.as_slice()), None => { let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect(); let selection = self @@ -316,7 +318,7 @@ impl<'df> GroupBy<'df> { .cloned() .collect::>(); - self.df.select_series_impl(selection.as_slice()) + self.df.select_columns_impl(selection.as_slice()) }, }?; @@ -394,7 +396,7 @@ impl<'df> GroupBy<'df> { let new_name = 
fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum); let mut agg = unsafe { agg_col.agg_sum(&self.groups) }; agg.rename(new_name); - cols.push(agg); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -431,7 +433,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min); let mut agg = unsafe { agg_col.agg_min(&self.groups) }; agg.rename(new_name); - cols.push(agg); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -468,7 +470,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max); let mut agg = unsafe { agg_col.agg_max(&self.groups) }; agg.rename(new_name); - cols.push(agg); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -505,7 +507,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First); let mut agg = unsafe { agg_col.agg_first(&self.groups) }; agg.rename(new_name); - cols.push(agg); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -542,7 +544,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last); let mut agg = unsafe { agg_col.agg_last(&self.groups) }; agg.rename(new_name); - cols.push(agg); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -579,7 +581,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique); let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) }; agg.rename(new_name); - cols.push(agg.into_series()); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -614,7 +616,7 @@ impl<'df> GroupBy<'df> { ); let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, interpol) }; agg.rename(new_name); - cols.push(agg.into_series()); + cols.push(agg.into()); } DataFrame::new(cols) } @@ -636,7 +638,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median); let mut agg = unsafe { 
agg_col.agg_median(&self.groups) }; agg.rename(new_name); - cols.push(agg.into_series()); + cols.push(agg); } DataFrame::new(cols) } @@ -649,7 +651,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof)); let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) }; agg.rename(new_name); - cols.push(agg.into_series()); + cols.push(agg); } DataFrame::new(cols) } @@ -662,7 +664,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof)); let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) }; agg.rename(new_name); - cols.push(agg.into_series()); + cols.push(agg); } DataFrame::new(cols) } @@ -704,7 +706,7 @@ impl<'df> GroupBy<'df> { ); let mut ca = self.groups.group_count(); ca.rename(new_name); - cols.push(ca.into_series()); + cols.push(ca.into_column()); } DataFrame::new(cols) } @@ -739,7 +741,7 @@ impl<'df> GroupBy<'df> { let mut column = self.groups.as_list_chunked(); let new_name = fmt_group_by_column("", GroupByMethod::Groups); column.rename(new_name); - cols.push(column.into_series()); + cols.push(column.into_column()); DataFrame::new(cols) } @@ -789,7 +791,7 @@ impl<'df> GroupBy<'df> { } else { let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len()); new_cols.extend_from_slice(&self.selected_keys); - let cols = self.df.select_series_impl(agg.as_slice())?; + let cols = self.df.select_columns_impl(agg.as_slice())?; new_cols.extend(cols); Ok(unsafe { DataFrame::new_no_checks(new_cols) }) } @@ -929,7 +931,7 @@ mod test { #[cfg(feature = "dtype-date")] #[cfg_attr(miri, ignore)] fn test_group_by() -> PolarsResult<()> { - let s0 = Series::new( + let s0 = Column::new( PlSmallStr::from_static("date"), &[ "2020-08-21", @@ -939,14 +941,14 @@ mod test { "2020-08-22", ], ); - let s1 = Series::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]); - let s2 = Series::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]); + 
let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]); + let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); let out = df.group_by_stable(["date"])?.select(["temp"]).count()?; assert_eq!( out.column("temp_count")?, - &Series::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1]) + &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1]) ); // Use of deprecated mean() for testing purposes @@ -958,7 +960,7 @@ mod test { .mean()?; assert_eq!( out.column("temp_mean")?, - &Series::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0]) + &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0]) ); // Use of deprecated `mean()` for testing purposes @@ -975,7 +977,7 @@ mod test { let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?; assert_eq!( out.column("temp_sum")?, - &Series::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9]) + &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9]) ); // Use of deprecated `n_unique()` for testing purposes @@ -991,19 +993,19 @@ mod test { #[cfg_attr(miri, ignore)] fn test_static_group_by_by_12_columns() { // Build GroupBy DataFrame. 
- let s0 = Series::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref()); - let s1 = Series::new("N".into(), [1, 2, 2, 4, 2].as_ref()); - let s2 = Series::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref()); - let s3 = Series::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref()); - let s4 = Series::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref()); - let s5 = Series::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref()); - let s6 = Series::new("G6".into(), [false, true, true, true, false].as_ref()); - let s7 = Series::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref()); - let s8 = Series::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref()); - let s9 = Series::new("G9".into(), [1, 2, 3, 3, 4].as_ref()); - let s10 = Series::new("G10".into(), [".", "!", "?", "?", "/"].as_ref()); - let s11 = Series::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref()); - let s12 = Series::new("G12".into(), ["-", "_", ";", ";", ","].as_ref()); + let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref()); + let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref()); + let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref()); + let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref()); + let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref()); + let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref()); + let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref()); + let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref()); + let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref()); + let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref()); + let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref()); + let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref()); + let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref()); let df = DataFrame::new(vec![s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]).unwrap(); @@ -1037,20 +1039,20 
@@ mod test { ]; // Vector to contain every series. - let mut series = Vec::with_capacity(14); + let mut columns = Vec::with_capacity(14); // Create a series for every group name. for series_name in series_names { - let group_series = Series::new(series_name.into(), series_content.as_ref()); - series.push(group_series); + let group_columns = Column::new(series_name.into(), series_content.as_ref()); + columns.push(group_columns); } // Create a series for the aggregation column. - let agg_series = Series::new("N".into(), [1, 2, 3, 3, 4].as_ref()); - series.push(agg_series); + let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref()); + columns.push(agg_series); // Create the dataframe with the computed series. - let df = DataFrame::new(series).unwrap(); + let df = DataFrame::new(columns).unwrap(); // Use of deprecated `sum()` for testing purposes #[allow(deprecated)] @@ -1122,7 +1124,7 @@ mod test { .unwrap(); assert_eq!( - Vec::from(res.column("bar_sum").unwrap().i32().unwrap()), + Vec::from(res.column("bar_sum").unwrap().as_materialized_series().i32().unwrap()), &[Some(2), Some(2), Some(1)] ); } @@ -1139,7 +1141,7 @@ mod test { let out = df.group_by_stable(["a"])?.mean()?; assert_eq!( - Vec::from(out.column("b_mean")?.f64()?), + Vec::from(out.column("b_mean")?.as_materialized_series().f64()?), &[Some(1.5), Some(1.0)] ); Ok(()) diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index bcbf486e0877..17bd1936a8f6 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -1,12 +1,13 @@ use polars_error::{polars_ensure, polars_err, PolarsResult}; use polars_utils::aliases::PlHashSet; +use super::Column; use crate::datatypes::AnyValue; use crate::frame::DataFrame; -use crate::prelude::{PlSmallStr, Series}; +use crate::prelude::PlSmallStr; fn check_hstack( - col: &Series, + col: &Column, names: &mut PlHashSet, height: usize, is_empty: bool, @@ -30,7 +31,7 @@ impl 
DataFrame { /// The caller must ensure: /// - the length of all [`Series`] is equal to the height of this [`DataFrame`] /// - the columns names are unique - pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Series]) -> &mut Self { + pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { self.columns.extend_from_slice(columns); self } @@ -46,7 +47,7 @@ impl DataFrame { /// df.hstack_mut(columns); /// } /// ``` - pub fn hstack_mut(&mut self, columns: &[Series]) -> PolarsResult<&mut Self> { + pub fn hstack_mut(&mut self, columns: &[Column]) -> PolarsResult<&mut Self> { let mut names = self .columns .iter() @@ -83,9 +84,11 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars .map(|mut df| { if df.height() != max_len { let diff = max_len - df.height(); - df.columns - .iter_mut() - .for_each(|s| *s = s.extend_constant(AnyValue::Null, diff).unwrap()); + df.columns.iter_mut().for_each(|s| { + // @scalar-opt + let s = s.as_materialized_series_mut(); + *s = s.extend_constant(AnyValue::Null, diff).unwrap() + }); } df }) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 648141688db8..1308b4ef1cc5 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1,11 +1,13 @@ //! DataFrame module. 
#[cfg(feature = "zip_with")] use std::borrow::Cow; +use std::sync::OnceLock; use std::{mem, ops}; use polars_utils::itertools::Itertools; use rayon::prelude::*; +use crate::chunked_array::metadata::MetadataFlags; #[cfg(feature = "algorithm_group_by")] use crate::chunked_array::ops::unique::is_unique_helper; use crate::prelude::*; @@ -37,7 +39,7 @@ use crate::hashing::_df_rows_to_hashes_threaded_vertical; #[cfg(feature = "zip_with")] use crate::prelude::min_max_binary::min_max_binary_series; use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort}; -use crate::series::IsSorted; +use crate::series::{BitRepr, IsSorted}; use crate::POOL; #[derive(Copy, Clone, Debug)] @@ -168,10 +170,559 @@ where /// ``` #[derive(Clone)] pub struct DataFrame { - pub(crate) columns: Vec, + // invariant: Column.len() is the same for each column + pub(crate) columns: Vec, +} + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +pub enum Column { + Series(Series), + Scalar(ScalarColumn), +} + +impl PartialEq for Column { + fn eq(&self, other: &Self) -> bool { + // @scalar-opt + self.as_materialized_series() + .eq(other.as_materialized_series()) + } +} + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +pub struct ScalarColumn { + name: PlSmallStr, + value: AnyValue<'static>, + // invariant: Series.len() == length + #[cfg_attr(feature = "serde", serde(skip))] + materialized: OnceLock, + length: usize, +} + +pub trait IntoColumn: Sized { + fn into_column(self) -> Column; +} + +impl IntoColumn for T { + fn into_column(self) -> Column { + IntoSeries::into_column(self) + } +} + +impl IntoColumn for Column { + fn into_column(self) -> Column { + self + } +} + +impl ScalarColumn { + #[inline] + pub fn new(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { + Self { + name, + value, + materialized: OnceLock::new(), + length, + } + } + + fn _to_series(name: 
PlSmallStr, value: AnyValue<'static>, length: usize) -> Series { + // @TODO: There is probably a better way to do this. + Scalar::new(value.dtype(), value) + .into_series(name) + .new_from_index(0, length) + } + + pub fn to_series(&self) -> Series { + Self::_to_series(self.name.clone(), self.value.clone(), self.length) + } + + pub fn as_materialized_series(&self) -> &Series { + self.materialized.get_or_init(|| self.to_series()) + } + + pub fn select_chunk(&self, _: usize) -> Series { + // @scalar-opt + // @scalar-correctness? + todo!() + } + + fn with_name(self, name: PlSmallStr) -> Self { + // @TODO: Keep materialized somehow? + Self::new(name, self.value, self.length) + } +} + +impl Column { + #[inline] + pub fn new(name: PlSmallStr, values: T) -> Self + where + Phantom: ?Sized, + Series: NamedFrom, + { + Self::Series(NamedFrom::new(name, values)) + } + + #[inline] + pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { + // @scalar-opt + Self::Series(Series::new_empty(name, &dtype)) + } + + #[inline] + pub fn new_scalar(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { + Self::Scalar(ScalarColumn::new(name, value, length)) + } + + #[inline] + pub fn as_materialized_series(&self) -> &Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => s.as_materialized_series(), + } + } + + #[inline] + pub fn as_materialized_series_mut(&mut self) -> &mut Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => { + *self = Column::Series(s.to_series()); + let Column::Series(s) = self else { + unreachable!(); + }; + s + }, + } + } + + #[inline] + pub fn dtype(&self) -> &DataType { + // @scalar-opt + self.as_materialized_series().dtype() + } + + #[inline] + pub fn field(&self) -> Cow { + // @scalar-opt + self.as_materialized_series().field() + } + + #[inline] + pub fn as_series(&self) -> Option<&Series> { + match self { + Column::Series(s) => Some(s), + Column::Scalar(_) => None, + } + } + + #[inline] + pub fn 
as_scalar_column(&self) -> Option<&ScalarColumn> { + match self { + Column::Series(_) => None, + Column::Scalar(s) => Some(s), + } + } + + pub fn i8(&self) -> PolarsResult<&Int8Chunked> { + // @scalar-opt + self.as_materialized_series().i8() + } + + pub fn i16(&self) -> PolarsResult<&Int16Chunked> { + // @scalar-opt + self.as_materialized_series().i16() + } + + pub fn i32(&self) -> PolarsResult<&Int32Chunked> { + // @scalar-opt + self.as_materialized_series().i32() + } + + pub fn i64(&self) -> PolarsResult<&Int64Chunked> { + // @scalar-opt + self.as_materialized_series().i64() + } + + pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { + // @scalar-opt + self.as_materialized_series().u8() + } + + pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { + // @scalar-opt + self.as_materialized_series().u16() + } + + pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { + // @scalar-opt + self.as_materialized_series().u32() + } + + pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { + // @scalar-opt + self.as_materialized_series().u64() + } + + pub fn f32(&self) -> PolarsResult<&Float32Chunked> { + // @scalar-opt + self.as_materialized_series().f32() + } + + pub fn f64(&self) -> PolarsResult<&Float64Chunked> { + // @scalar-opt + self.as_materialized_series().f64() + } + + pub fn str(&self) -> PolarsResult<&StringChunked> { + // @scalar-opt + self.as_materialized_series().str() + } + + #[inline] + pub fn rename(&mut self, name: PlSmallStr) { + match self { + Column::Series(s) => _ = s.rename(name), + Column::Scalar(s) => { + if let Some(series) = s.materialized.get_mut() { + series.rename(name.clone()); + } + + s.name = name; + }, + } + } + + pub fn clear(&self) -> Self { + match self { + Column::Series(s) => s.clear().into(), + Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), + } + } + + #[inline] + pub fn shrink_to_fit(&mut self) { + match self { + Column::Series(s) => s.shrink_to_fit(), + Column::Scalar(_) => {}, + } + } + + #[inline] + pub fn 
new_from_index(&self, index: usize, length: usize) -> Self { + // @scalar-opt + Self::Series(self.as_materialized_series().new_from_index(index, length)) + } + + #[inline] + pub fn len(&self) -> usize { + match self { + Column::Series(s) => s.len(), + Column::Scalar(s) => s.length, + } + } + + #[inline] + pub fn name(&self) -> &PlSmallStr { + match self { + Column::Series(s) => s.name(), + Column::Scalar(s) => &s.name, + } + } + + pub fn has_nulls(&self) -> bool { + // @scalar-opt + self.as_materialized_series().has_nulls() + } + + pub fn is_not_null(&self) -> ChunkedArray { + // @scalar-opt + self.as_materialized_series().is_not_null() + } + + pub fn to_physical_repr(&self) -> Column { + // @scalar-opt + self.as_materialized_series() + .to_physical_repr() + .into_owned() + .into() + } + + pub fn head(&self, length: Option) -> Column { + // @scalar-opt + self.as_materialized_series().head(length).into() + } + + pub fn tail(&self, length: Option) -> Column { + // @scalar-opt + self.as_materialized_series().tail(length).into() + } + + pub fn slice(&self, offset: i64, length: usize) -> Column { + // @scalar-opt + self.as_materialized_series().slice(offset, length).into() + } + + pub fn split_at(&self, offset: i64) -> (Column, Column) { + // @scalar-opt + let (l, r) = self.as_materialized_series().split_at(offset); + (l.into(), r.into()) + } + + pub fn null_count(&self) -> usize { + // @scalar-opt + self.as_materialized_series().null_count() + } + + pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_min(groups) }.into() + } + + pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_max(groups) }.into() + } + + pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_mean(groups) }.into() + } + + pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { + // 
@scalar-opt + unsafe { self.as_materialized_series().agg_sum(groups) }.into() + } + + pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_first(groups) }.into() + } + + pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_last(groups) }.into() + } + + pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() + } + + pub unsafe fn agg_quantile( + &self, + groups: &GroupsProxy, + quantile: f64, + interpol: QuantileInterpolOptions, + ) -> Self { + // @scalar-opt + unsafe { + self.as_materialized_series() + .agg_quantile(groups, quantile, interpol) + } + .into() + } + + pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_median(groups) }.into() + } + + pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() + } + + pub unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() + } + + pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_list(groups) }.into() + } + + pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Column { + // @scalar-opt + Series::full_null(name, size, dtype).into() + } + + pub fn is_empty(&self) -> bool { + // @scalar-opt + self.as_materialized_series().is_empty() + } + + pub fn reverse(&self) -> Column { + // @scalar-opt + self.as_materialized_series().reverse().into() + } + + pub fn equals(&self, right: &Column) -> bool { + // @scalar-opt + self.as_materialized_series() + .equals(right.as_materialized_series()) + } + + pub fn equals_missing(&self, 
right: &Column) -> bool { + // @scalar-opt + self.as_materialized_series() + .equals_missing(right.as_materialized_series()) + } + + pub fn set_sorted_flag(&mut self, sorted: IsSorted) { + // @scalar-opt + match self { + Column::Series(s) => s.set_sorted_flag(sorted), + Column::Scalar(_) => {}, + } + } + + pub fn get_flags(&self) -> MetadataFlags { + match self { + Column::Series(s) => s.get_flags(), + // @scalar-opt + Column::Scalar(_) => MetadataFlags::empty(), + } + } + + pub fn get_data_ptr(&self) -> usize { + // @scalar-opt + self.as_materialized_series().get_data_ptr() + } + + pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { + // @scalar-opt? + self.as_materialized_series().vec_hash(build_hasher, buf) + } + + pub fn vec_hash_combine( + &self, + build_hasher: PlRandomState, + hashes: &mut [u64], + ) -> PolarsResult<()> { + // @scalar-opt? + self.as_materialized_series() + .vec_hash_combine(build_hasher, hashes) + } + + pub(crate) unsafe fn equal_element( + &self, + idx_self: usize, + idx_other: usize, + other: &Column, + ) -> bool { + // @scalar-opt + unsafe { + self.as_materialized_series().equal_element( + idx_self, + idx_other, + other.as_materialized_series(), + ) + } + } + + pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { + self.as_materialized_series().categorical() + } + + pub fn with_name(self, name: PlSmallStr) -> Column { + match self { + Column::Series(s) => s.with_name(name).into(), + Column::Scalar(s) => s.with_name(name).into(), + } + } + + pub(crate) fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { + // @scalar-opt + self.as_materialized_series_mut() + .append(other.as_materialized_series())?; + Ok(self) + } + + pub fn arg_sort(&self, options: SortOptions) -> IdxCa { + // @scalar-opt + self.as_materialized_series().arg_sort(options) + } + + pub fn cast(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().cast(dtype).map(Column::from) 
+ } + + pub fn idx(&self) -> PolarsResult<&IdxCa> { + // @scalar-opt + self.as_materialized_series().idx() + } + + pub fn binary(&self) -> PolarsResult<&BinaryChunked> { + // @scalar-opt + self.as_materialized_series().binary() + } + + pub fn bit_repr(&self) -> Option { + // @scalar-opt + self.as_materialized_series().bit_repr() + } + + pub fn bool(&self) -> PolarsResult<&BooleanChunked> { + // @scalar-opt + self.as_materialized_series().bool() + } + + pub fn struct_(&self) -> PolarsResult<&StructChunked> { + // @scalar-opt + self.as_materialized_series().struct_() + } + + pub fn into_frame(&self) -> DataFrame { + // @scalar-opt + self.as_materialized_series().clone().into_frame() + } + + pub fn unique_stable(&self) -> PolarsResult { + // @scalar-opt? + self.as_materialized_series().unique_stable().map(Column::from) + } + + pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { + // @scalar-opt + self.as_materialized_series_mut().extend(other.as_materialized_series())?; + Ok(self) + } + + pub fn rechunk(&self) -> Column { + match self { + Column::Series(s) => s.rechunk().into(), + Column::Scalar(_) => self.clone(), + } + } +} + +impl From for Column { + #[inline] + fn from(value: Series) -> Self { + Self::Series(value) + } +} + +impl From for Column { + #[inline] + fn from(value: ScalarColumn) -> Self { + Self::Scalar(value) + } } impl DataFrame { + pub fn materialized_column_iter(&self) -> impl Iterator { + self.columns.iter().map(Column::as_materialized_series) + } + + pub fn par_materialized_column_iter(&self) -> impl ParallelIterator { + self.columns.par_iter().map(Column::as_materialized_series) + } + /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes. /// /// # Implementation @@ -185,36 +736,54 @@ impl DataFrame { /// /// FFI buffers are included in this estimation. 
pub fn estimated_size(&self) -> usize { - self.columns.iter().map(|s| s.estimated_size()).sum() + self.materialized_column_iter() + .map(|s| s.estimated_size()) + .sum() } // Reduce monomorphization. - pub fn _apply_columns(&self, func: &(dyn Fn(&Series) -> Series)) -> Vec { - self.columns.iter().map(func).collect() + pub fn _apply_columns(&self, func: &(dyn Fn(&Series) -> Series)) -> Vec { + self.materialized_column_iter() + .map(func) + .map(Column::from) + .collect() } // Reduce monomorphization. pub fn _apply_columns_par( &self, func: &(dyn Fn(&Series) -> Series + Send + Sync), - ) -> Vec { - POOL.install(|| self.columns.par_iter().map(func).collect()) + ) -> Vec { + POOL.install(|| { + self.par_materialized_column_iter() + .map(func) + .map(Column::from) + .collect() + }) } // Reduce monomorphization. fn try_apply_columns_par( &self, func: &(dyn Fn(&Series) -> PolarsResult + Send + Sync), - ) -> PolarsResult> { - POOL.install(|| self.columns.par_iter().map(func).collect()) + ) -> PolarsResult> { + POOL.install(|| { + self.par_materialized_column_iter() + .map(func) + .map(|s| s.map(Column::from)) + .collect() + }) } // Reduce monomorphization. fn try_apply_columns( &self, func: &(dyn Fn(&Series) -> PolarsResult + Send + Sync), - ) -> PolarsResult> { - self.columns.iter().map(func).collect() + ) -> PolarsResult> { + self.materialized_column_iter() + .map(func) + .map(|s| s.map(Column::from)) + .collect() } /// Get the index of the column. @@ -234,9 +803,11 @@ impl DataFrame { /// Reserve additional slots into the chunks of the series. pub(crate) fn reserve_chunks(&mut self, additional: usize) { for s in &mut self.columns { - // SAFETY: - // do not modify the data, simply resize. - unsafe { s.chunks_mut().reserve(additional) } + if let Column::Series(s) = s { + // SAFETY: + // do not modify the data, simply resize. 
+ unsafe { s.chunks_mut().reserve(additional) } + } } } @@ -252,7 +823,7 @@ impl DataFrame { /// let df = DataFrame::new(vec![s0, s1])?; /// # Ok::<(), PolarsError>(()) /// ``` - pub fn new(columns: Vec) -> PolarsResult { + pub fn new(columns: Vec) -> PolarsResult { ensure_names_unique(&columns, |s| s.name().as_str())?; if columns.len() > 1 { @@ -271,7 +842,7 @@ impl DataFrame { /// Converts a sequence of columns into a DataFrame, broadcasting length-1 /// columns to match the other columns. - pub fn new_with_broadcast(columns: Vec) -> PolarsResult { + pub fn new_with_broadcast(columns: Vec) -> PolarsResult { ensure_names_unique(&columns, |s| s.name().as_str())?; unsafe { Self::new_with_broadcast_no_checks(columns) } } @@ -281,7 +852,7 @@ impl DataFrame { /// /// # Safety /// Does not check that the column names are unique (which they must be). - pub unsafe fn new_with_broadcast_no_checks(mut columns: Vec) -> PolarsResult { + pub unsafe fn new_with_broadcast_no_checks(mut columns: Vec) -> PolarsResult { // The length of the longest non-unit length column determines the // broadcast length. If all columns are unit-length the broadcast length // is one. 
@@ -327,7 +898,7 @@ impl DataFrame { pub fn empty_with_schema(schema: &Schema) -> Self { let cols = schema .iter() - .map(|(name, dtype)| Series::new_empty(name.clone(), dtype)) + .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype))) .collect(); unsafe { DataFrame::new_no_checks(cols) } } @@ -336,7 +907,7 @@ impl DataFrame { pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self { let cols = schema .iter_values() - .map(|fld| Series::new_empty(fld.name.clone(), &(fld.dtype().into()))) + .map(|fld| Column::from(Series::new_empty(fld.name.clone(), &(fld.dtype().into())))) .collect(); unsafe { DataFrame::new_no_checks(cols) } } @@ -357,7 +928,7 @@ impl DataFrame { /// assert!(df.is_empty()); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn pop(&mut self) -> Option { + pub fn pop(&mut self) -> Option { self.columns.pop() } @@ -404,7 +975,7 @@ impl DataFrame { (offset..(self.height() as IdxSize) + offset).collect(), ); ca.set_sorted_flag(IsSorted::Ascending); - columns.push(ca.into_series()); + columns.push(ca.into_series().into()); columns.extend_from_slice(&self.columns); DataFrame::new(columns) @@ -419,7 +990,7 @@ impl DataFrame { ); ca.set_sorted_flag(IsSorted::Ascending); - self.columns.insert(0, ca.into_series()); + self.columns.insert(0, ca.into_series().into()); self } @@ -431,7 +1002,7 @@ impl DataFrame { /// /// It is the callers responsibility to uphold the contract of all `Series` /// having an equal length and a unique name, if not this may panic down the line. - pub const unsafe fn new_no_checks(columns: Vec) -> DataFrame { + pub const unsafe fn new_no_checks(columns: Vec) -> DataFrame { DataFrame { columns } } @@ -444,7 +1015,7 @@ impl DataFrame { /// /// It is the callers responsibility to uphold the contract of all `Series` /// having an equal length, if not this may panic down the line. 
- pub unsafe fn new_no_length_checks(columns: Vec) -> PolarsResult { + pub unsafe fn new_no_length_checks(columns: Vec) -> PolarsResult { ensure_names_unique(&columns, |s| s.name().as_str())?; Ok(DataFrame { columns }) } @@ -461,7 +1032,9 @@ impl DataFrame { pub fn as_single_chunk(&mut self) -> &mut Self { // Don't parallelize this. Memory overhead for s in &mut self.columns { - *s = s.rechunk(); + if let Column::Series(s) = s { + *s = s.rechunk(); + } } self } @@ -480,12 +1053,17 @@ impl DataFrame { pub fn should_rechunk(&self) -> bool { // Fast check. It is also needed for correctness, as code below doesn't check if the number // of chunks is equal. - if !self.get_columns().iter().map(|s| s.n_chunks()).all_equal() { + if !self + .get_columns() + .iter() + .filter_map(|c| c.as_series().map(|s| s.n_chunks())) + .all_equal() + { return true; } // From here we check chunk lengths. - let mut chunk_lengths = self.columns.iter().map(|s| s.chunk_lengths()); + let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths()); match chunk_lengths.next() { None => false, Some(first_column_chunk_lengths) => { @@ -538,8 +1116,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn schema(&self) -> Schema { - self.columns - .iter() + self.materialized_column_iter() .map(|x| (x.name().clone(), x.dtype().clone())) .collect() } @@ -559,7 +1136,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` #[inline] - pub fn get_columns(&self) -> &[Series] { + pub fn get_columns(&self) -> &[Column] { &self.columns } @@ -568,12 +1145,12 @@ impl DataFrame { /// /// # Safety /// The caller must ensure the length of all [`Series`] remains equal. - pub unsafe fn get_columns_mut(&mut self) -> &mut Vec { + pub unsafe fn get_columns_mut(&mut self) -> &mut Vec { &mut self.columns } /// Take ownership of the underlying columns vec. 
- pub fn take_columns(self) -> Vec { + pub fn take_columns(self) -> Vec { self.columns } @@ -594,8 +1171,8 @@ impl DataFrame { /// assert_eq!(iterator.next(), None); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn iter(&self) -> std::slice::Iter<'_, Series> { - self.columns.iter() + pub fn iter(&self) -> impl Iterator { + self.materialized_column_iter() } /// # Example @@ -678,9 +1255,14 @@ impl DataFrame { self.columns.iter().map(|s| s.dtype().clone()).collect() } + pub(crate) fn first_series_column(&self) -> Option<&Series> { + self.columns.iter().find_map(|col| col.as_series()) + } + /// The number of chunks per column pub fn n_chunks(&self) -> usize { - match self.columns.first() { + // @scalar-correctness? + match self.first_series_column() { None => 0, Some(s) => s.n_chunks(), } @@ -821,7 +1403,7 @@ impl DataFrame { /// | Gold | 79 | 79 | /// +---------+--------+----------+ /// ``` - pub fn hstack(&self, columns: &[Series]) -> PolarsResult { + pub fn hstack(&self, columns: &[Column]) -> PolarsResult { let mut new_cols = self.columns.clone(); new_cols.extend_from_slice(columns); DataFrame::new(new_cols) @@ -929,6 +1511,10 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .try_for_each::<_, PolarsResult<_>>(|(left, right)| { + // @scalar-opt + let left = left.as_materialized_series_mut(); + let right = right.as_materialized_series(); + ensure_can_extend(left, right)?; left.append(right)?; Ok(()) @@ -947,6 +1533,10 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .for_each(|(left, right)| { + // @scalar-opt + let left = left.as_materialized_series_mut(); + let right = right.as_materialized_series(); + left.append(right).expect("should not fail"); }); } @@ -976,6 +1566,9 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .try_for_each::<_, PolarsResult<_>>(|(left, right)| { + let left = left.as_materialized_series_mut(); + let right = right.as_materialized_series(); + ensure_can_extend(left, right)?; left.extend(right)?; Ok(()) @@ 
-991,14 +1584,14 @@ impl DataFrame { /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"], /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?; /// - /// let s1: PolarsResult = df.drop_in_place("Average weight"); + /// let s1: PolarsResult = df.drop_in_place("Average weight"); /// assert!(s1.is_err()); /// - /// let s2: Series = df.drop_in_place("Animal")?; - /// assert_eq!(s2, Series::new("Animal".into(), &["Tiger", "Lion", "Great auk"])); + /// let s2: Column = df.drop_in_place("Animal")?; + /// assert_eq!(s2, Column::new_series("Animal".into(), &["Tiger", "Lion", "Great auk"])); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn drop_in_place(&mut self, name: &str) -> PolarsResult { + pub fn drop_in_place(&mut self, name: &str) -> PolarsResult { let idx = self.check_name_to_idx(name)?; Ok(self.columns.remove(idx)) } @@ -1036,14 +1629,14 @@ impl DataFrame { for<'a> &'a S: Into, { if let Some(v) = subset { - let v = self.select_series(v)?; + let v = self.select_columns(v)?; self._drop_nulls_impl(v.as_slice()) } else { self._drop_nulls_impl(self.columns.as_slice()) } } - fn _drop_nulls_impl(&self, subset: &[Series]) -> PolarsResult { + fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult { // fast path for no nulls in df if subset.iter().all(|s| !s.has_nulls()) { return Ok(self.clone()); @@ -1056,8 +1649,8 @@ impl DataFrame { .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?; let mut mask = mask.is_not_null(); - for s in iter { - mask = mask & s.is_not_null(); + for c in iter { + mask = mask & c.is_not_null(); } self.filter(&mask) } @@ -1119,63 +1712,63 @@ impl DataFrame { fn insert_column_no_name_check( &mut self, index: usize, - series: Series, + column: Column, ) -> PolarsResult<&mut Self> { polars_ensure!( - self.width() == 0 || series.len() == self.height(), + self.width() == 0 || column.len() == self.height(), ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}", - series.len(), 
self.height(), + column.len(), self.height(), ); - self.columns.insert(index, series); + self.columns.insert(index, column); Ok(self) } /// Insert a new column at a given index. - pub fn insert_column( + pub fn insert_column( &mut self, index: usize, column: S, ) -> PolarsResult<&mut Self> { - let series = column.into_series(); - self.check_already_present(series.name().as_str())?; - self.insert_column_no_name_check(index, series) + let column = column.into_column(); + self.check_already_present(column.name().as_str())?; + self.insert_column_no_name_check(index, column) } - fn add_column_by_search(&mut self, series: Series) -> PolarsResult<()> { - if let Some(idx) = self.get_column_index(series.name().as_str()) { - self.replace_column(idx, series)?; + fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> { + if let Some(idx) = self.get_column_index(column.name().as_str()) { + self.replace_column(idx, column)?; } else { - self.columns.push(series); + self.columns.push(column); } Ok(()) } /// Add a new column to this [`DataFrame`] or replace an existing one. 
- pub fn with_column(&mut self, column: S) -> PolarsResult<&mut Self> { - fn inner(df: &mut DataFrame, mut series: Series) -> PolarsResult<&mut DataFrame> { + pub fn with_column(&mut self, column: C) -> PolarsResult<&mut Self> { + fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> { let height = df.height(); - if series.len() == 1 && height > 1 { - series = series.new_from_index(0, height); + if column.len() == 1 && height > 1 { + column = column.new_from_index(0, height); } - if series.len() == height || df.get_columns().is_empty() { - df.add_column_by_search(series)?; + if column.len() == height || df.get_columns().is_empty() { + df.add_column_by_search(column)?; Ok(df) } // special case for literals - else if height == 0 && series.len() == 1 { - let s = series.clear(); + else if height == 0 && column.len() == 1 { + let s = column.clear(); df.add_column_by_search(s)?; Ok(df) } else { polars_bail!( ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}", - series.len(), height, + column.len(), height, ); } } - let series = column.into_series(); - inner(self, series) + let column = column.into_column(); + inner(self, column) } /// Adds a column to the [`DataFrame`] without doing any checks @@ -1195,17 +1788,17 @@ impl DataFrame { } } - fn add_column_by_schema(&mut self, s: Series, schema: &Schema) -> PolarsResult<()> { - let name = s.name(); + fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> { + let name = c.name(); if let Some((idx, _, _)) = schema.get_full(name.as_str()) { // schema is incorrect fallback to search if self.columns.get(idx).map(|s| s.name()) != Some(name) { - self.add_column_by_search(s)?; + self.add_column_by_search(c)?; } else { - self.replace_column(idx, s)?; + self.replace_column(idx, c)?; } } else { - self.columns.push(s); + self.columns.push(c); } Ok(()) } @@ -1226,31 +1819,31 @@ impl DataFrame { /// Add a new column to this [`DataFrame`] or replace an 
existing one. /// Uses an existing schema to amortize lookups. /// If the schema is incorrect, we will fallback to linear search. - pub fn with_column_and_schema( + pub fn with_column_and_schema( &mut self, - column: S, + column: C, schema: &Schema, ) -> PolarsResult<&mut Self> { - let mut series = column.into_series(); + let mut column = column.into_column(); let height = self.height(); - if series.len() == 1 && height > 1 { - series = series.new_from_index(0, height); + if column.len() == 1 && height > 1 { + column = column.new_from_index(0, height); } - if series.len() == height || self.columns.is_empty() { - self.add_column_by_schema(series, schema)?; + if column.len() == height || self.columns.is_empty() { + self.add_column_by_schema(column, schema)?; Ok(self) } // special case for literals - else if height == 0 && series.len() == 1 { - let s = series.clear(); + else if height == 0 && column.len() == 1 { + let s = column.clear(); self.add_column_by_schema(s, schema)?; Ok(self) } else { polars_bail!( ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}", - series.len(), height, + column.len(), height, ); } } @@ -1275,7 +1868,13 @@ impl DataFrame { None => return None, } // SAFETY: we just checked bounds - unsafe { Some(self.columns.iter().map(|s| s.get_unchecked(idx)).collect()) } + unsafe { + Some( + self.materialized_column_iter() + .map(|s| s.get_unchecked(idx)) + .collect(), + ) + } } /// Select a [`Series`] by index. 
@@ -1293,7 +1892,7 @@ impl DataFrame { /// assert_eq!(s1, Some(&s2)); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn select_at_idx(&self, idx: usize) -> Option<&Series> { + pub fn select_at_idx(&self, idx: usize) -> Option<&Column> { self.columns.get(idx) } @@ -1301,7 +1900,7 @@ impl DataFrame { /// /// *Note: the length of the Series should remain the same otherwise the DataFrame is invalid.* /// For this reason the method is not public - fn select_at_idx_mut(&mut self, idx: usize) -> Option<&mut Series> { + fn select_at_idx_mut(&mut self, idx: usize) -> Option<&mut Column> { self.columns.get_mut(idx) } @@ -1407,7 +2006,7 @@ impl DataFrame { /// assert_eq!(df.column("Password")?, &s1); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn column(&self, name: &str) -> PolarsResult<&Series> { + pub fn column(&self, name: &str) -> PolarsResult<&Column> { let idx = self.try_get_column_index(name)?; Ok(self.select_at_idx(idx).unwrap()) } @@ -1426,7 +2025,7 @@ impl DataFrame { /// assert_eq!(&df[1], sv[1]); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn columns(&self, names: I) -> PolarsResult> + pub fn columns(&self, names: I) -> PolarsResult> where I: IntoIterator, S: AsRef, @@ -1462,7 +2061,7 @@ impl DataFrame { } pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult { - let selected = self.select_series_impl(cols)?; + let selected = self.select_columns_impl(cols)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) } @@ -1499,16 +2098,16 @@ impl DataFrame { if check_duplicates { ensure_names_unique(cols, |s| s.as_str())?; } - let selected = self.select_series_impl_with_schema(cols, schema)?; + let selected = self.select_columns_impl_with_schema(cols, schema)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) } /// A non generic implementation to reduce compiler bloat. 
- fn select_series_impl_with_schema( + fn select_columns_impl_with_schema( &self, cols: &[PlSmallStr], schema: &Schema, - ) -> PolarsResult> { + ) -> PolarsResult> { cols.iter() .map(|name| { let index = schema.try_get_full(name.as_str())?.0; @@ -1528,7 +2127,7 @@ impl DataFrame { fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult { ensure_names_unique(cols, |s| s.as_str())?; - let selected = self.select_series_physical_impl(cols)?; + let selected = self.select_columns_physical_impl(cols)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) } @@ -1547,9 +2146,9 @@ impl DataFrame { /// assert_eq!(df["Hydrogen"], sv[1]); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn select_series(&self, selection: impl IntoVec) -> PolarsResult> { + pub fn select_columns(&self, selection: impl IntoVec) -> PolarsResult> { let cols = selection.into_vec(); - self.select_series_impl(&cols) + self.select_columns_impl(&cols) } fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> { @@ -1561,7 +2160,7 @@ impl DataFrame { } /// A non generic implementation to reduce compiler bloat. - fn select_series_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult> { + fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult> { let selected = if cols.len() > 1 && self.columns.len() > 10 { let name_to_idx = self._names_to_idx_map(); cols.iter() @@ -1569,19 +2168,12 @@ impl DataFrame { let idx = *name_to_idx .get(name.as_str()) .ok_or_else(|| polars_err!(col_not_found = name))?; - Ok(self - .select_at_idx(idx) - .unwrap() - .to_physical_repr() - .into_owned()) + Ok(self.select_at_idx(idx).unwrap().to_physical_repr()) }) .collect::>>()? } else { cols.iter() - .map(|c| { - self.column(c.as_str()) - .map(|s| s.to_physical_repr().into_owned()) - }) + .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr())) .collect::>>()? }; @@ -1589,7 +2181,7 @@ impl DataFrame { } /// A non generic implementation to reduce compiler bloat. 
- fn select_series_impl(&self, cols: &[PlSmallStr]) -> PolarsResult> { + fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult> { let selected = if cols.len() > 1 && self.columns.len() > 10 { // we hash, because there are user that having millions of columns. // # https://github.com/pola-rs/polars/issues/1023 @@ -1615,7 +2207,7 @@ impl DataFrame { /// Select a mutable series by name. /// *Note: the length of the Series should remain the same otherwise the DataFrame is invalid.* /// For this reason the method is not public - fn select_mut(&mut self, name: &str) -> Option<&mut Series> { + fn select_mut(&mut self, name: &str) -> Option<&mut Column> { let opt_idx = self.get_column_index(name); opt_idx.and_then(|idx| self.select_at_idx_mut(idx)) @@ -1672,7 +2264,10 @@ impl DataFrame { let cols = if allow_threads { POOL.install(|| self._apply_columns_par(&|s| s.take_unchecked(idx))) } else { - self.columns.iter().map(|s| s.take_unchecked(idx)).collect() + self.materialized_column_iter() + .map(|s| s.take_unchecked(idx)) + .map(Column::from) + .collect() }; unsafe { DataFrame::new_no_checks(cols) } } @@ -1685,9 +2280,9 @@ impl DataFrame { let cols = if allow_threads { POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx))) } else { - self.columns - .iter() + self.materialized_column_iter() .map(|s| s.take_slice_unchecked(idx)) + .map(Column::from) .collect() }; unsafe { DataFrame::new_no_checks(cols) } @@ -1715,7 +2310,7 @@ impl DataFrame { ); self.select_mut(column) .ok_or_else(|| polars_err!(col_not_found = column)) - .map(|s| s.rename(name))?; + .map(|c| c.rename(name))?; Ok(self) } @@ -1727,7 +2322,7 @@ impl DataFrame { by: impl IntoVec, sort_options: SortMultipleOptions, ) -> PolarsResult<&mut Self> { - let by_column = self.select_series(by)?; + let by_column = self.select_columns(by)?; self.columns = self.sort_impl(by_column, sort_options, None)?.columns; Ok(self) } @@ -1736,7 +2331,7 @@ impl DataFrame { /// This is the dispatch of 
Self::sort, and exists to reduce compile bloat by monomorphization. pub fn sort_impl( &self, - by_column: Vec, + by_column: Vec, mut sort_options: SortMultipleOptions, slice: Option<(i64, usize)>, ) -> PolarsResult { @@ -1786,7 +2381,7 @@ impl DataFrame { let df = df.as_single_chunk_par(); let mut take = match (by_column.len(), has_struct) { (1, false) => { - let s = &by_column[0]; + let s = &by_column[0].as_materialized_series(); let options = SortOptions { descending: sort_options.descending[0], nulls_last: sort_options.nulls_last[0], @@ -1818,7 +2413,9 @@ impl DataFrame { )? } else { let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?; - first.arg_sort_multiple(&other, &sort_options)? + first + .as_materialized_series() + .arg_sort_multiple(&other, &sort_options)? } }, }; @@ -1925,10 +2522,10 @@ impl DataFrame { /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn replace_column( + pub fn replace_column( &mut self, index: usize, - new_column: S, + new_column: C, ) -> PolarsResult<&mut Self> { polars_ensure!( index < self.width(), @@ -1936,7 +2533,7 @@ impl DataFrame { "unable to replace at index {}, the DataFrame has only {} columns", index, self.width(), ); - let mut new_column = new_column.into_series(); + let mut new_column = new_column.into_column(); polars_ensure!( new_column.len() == self.height(), ShapeMismatch: @@ -1988,13 +2585,13 @@ impl DataFrame { /// | "egg" | 3 | /// +--------+-------+ /// ``` - pub fn apply(&mut self, name: &str, f: F) -> PolarsResult<&mut Self> + pub fn apply(&mut self, name: &str, f: F) -> PolarsResult<&mut Self> where - F: FnOnce(&Series) -> S, - S: IntoSeries, + F: FnOnce(&Series) -> C, + C: IntoColumn, { let idx = self.check_name_to_idx(name)?; - self.apply_at_idx(idx, f) + self.apply_at_idx(idx, |c| f(c.as_materialized_series())) } /// Apply a closure to a column at index `idx`. 
This is the recommended way to do in place @@ -2027,10 +2624,10 @@ impl DataFrame { /// | "egg" | 111 | /// +--------+-------+ /// ``` - pub fn apply_at_idx(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self> + pub fn apply_at_idx(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self> where - F: FnOnce(&Series) -> S, - S: IntoSeries, + F: FnOnce(&Column) -> C, + C: IntoColumn, { let df_height = self.height(); let width = self.width(); @@ -2041,7 +2638,7 @@ impl DataFrame { ) })?; let name = col.name().clone(); - let new_col = f(col).into_series(); + let new_col = f(col).into_column(); match new_col.len() { 1 => { let new_col = new_col.new_from_index(0, df_height); @@ -2105,10 +2702,10 @@ impl DataFrame { /// | "quack-is-modified" | 5 | /// +---------------------+--------+ /// ``` - pub fn try_apply_at_idx(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self> + pub fn try_apply_at_idx(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self> where - F: FnOnce(&Series) -> PolarsResult, - S: IntoSeries, + F: FnOnce(&Column) -> PolarsResult, + C: IntoColumn, { let width = self.width(); let col = self.columns.get_mut(idx).ok_or_else(|| { @@ -2119,7 +2716,7 @@ impl DataFrame { })?; let name = col.name().clone(); - let _ = mem::replace(col, f(col).map(|s| s.into_series())?); + let _ = mem::replace(col, f(col).map(|c| c.into_column())?); // make sure the name remains the same after applying the closure unsafe { @@ -2171,13 +2768,13 @@ impl DataFrame { /// | "not_within_bounds" | 5 | /// +---------------------+--------+ /// ``` - pub fn try_apply(&mut self, column: &str, f: F) -> PolarsResult<&mut Self> + pub fn try_apply(&mut self, column: &str, f: F) -> PolarsResult<&mut Self> where - F: FnOnce(&Series) -> PolarsResult, - S: IntoSeries, + F: FnOnce(&Series) -> PolarsResult, + C: IntoColumn, { let idx = self.try_get_column_index(column)?; - self.try_apply_at_idx(idx, f) + self.try_apply_at_idx(idx, |c| f(c.as_materialized_series())) } /// Slice the [`DataFrame`] along 
the rows. @@ -2243,6 +2840,7 @@ impl DataFrame { if offset == 0 && length == self.height() { return self.clone(); } + // @scalar-opt let columns = self._apply_columns_par(&|s| s.slice(offset, length)); unsafe { DataFrame::new_no_checks(columns) } } @@ -2252,6 +2850,7 @@ impl DataFrame { if offset == 0 && length == self.height() { return self.clone(); } + // @scalar-opt let columns = self._apply_columns(&|s| { let mut out = s.slice(offset, length); out.shrink_to_fit(); @@ -2298,7 +2897,7 @@ impl DataFrame { let col = self .columns .iter() - .map(|s| s.head(length)) + .map(|c| c.head(length)) .collect::>(); unsafe { DataFrame::new_no_checks(col) } } @@ -2338,7 +2937,7 @@ impl DataFrame { let col = self .columns .iter() - .map(|s| s.tail(length)) + .map(|c| c.tail(length)) .collect::>(); unsafe { DataFrame::new_no_checks(col) } } @@ -2385,7 +2984,10 @@ impl DataFrame { /// as well. pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> { PhysRecordBatchIter { - iters: self.columns.iter().map(|s| s.chunks().iter()).collect(), + iters: self + .materialized_column_iter() + .map(|s| s.chunks().iter()) + .collect(), } } @@ -2427,14 +3029,19 @@ impl DataFrame { match self.columns.len() { 0 => Ok(None), - 1 => Ok(Some(self.columns[0].clone())), - 2 => min_fn(&self.columns[0], &self.columns[1]).map(Some), + 1 => Ok(Some( + self.columns[0].clone().as_materialized_series().clone(), + )), + 2 => min_fn( + &self.columns[0].as_materialized_series(), + &self.columns[1].as_materialized_series(), + ) + .map(Some), _ => { // the try_reduce_with is a bit slower in parallelism, // but I don't think it matters here as we parallelize over columns, not over elements POOL.install(|| { - self.columns - .par_iter() + self.par_materialized_column_iter() .map(|s| Ok(Cow::Borrowed(s))) .try_reduce_with(|l, r| min_fn(&l, &r).map(Cow::Owned)) // we can unwrap the option, because we are certain there is a column @@ -2453,14 +3060,17 @@ impl DataFrame { match self.columns.len() { 0 => 
Ok(None), - 1 => Ok(Some(self.columns[0].clone())), - 2 => max_fn(&self.columns[0], &self.columns[1]).map(Some), + 1 => Ok(Some(self.columns[0].as_materialized_series().clone())), + 2 => max_fn( + &self.columns[0].as_materialized_series(), + &self.columns[1].as_materialized_series(), + ) + .map(Some), _ => { // the try_reduce_with is a bit slower in parallelism, // but I don't think it matters here as we parallelize over columns, not over elements POOL.install(|| { - self.columns - .par_iter() + self.par_materialized_column_iter() .map(|s| Ok(Cow::Borrowed(s))) .try_reduce_with(|l, r| max_fn(&l, &r).map(Cow::Owned)) // we can unwrap the option, because we are certain there is a column @@ -2494,8 +3104,7 @@ impl DataFrame { }; let non_null_cols = self - .columns - .iter() + .materialized_column_iter() .filter(|x| x.dtype() != &DataType::Null) .collect::>(); @@ -2505,7 +3114,7 @@ impl DataFrame { Ok(None) } else { // all columns are null dtype, so result is null dtype - Ok(Some(self.columns[0].clone())) + Ok(Some(self.columns[0].as_materialized_series().clone())) } }, 1 => Ok(Some(apply_null_strategy( @@ -2545,9 +3154,11 @@ impl DataFrame { 0 => Ok(None), 1 => Ok(Some(match self.columns[0].dtype() { dt if dt != &DataType::Float32 && (dt.is_numeric() || dt == &DataType::Boolean) => { - self.columns[0].cast(&DataType::Float64)? + self.columns[0] + .as_materialized_series() + .cast(&DataType::Float64)? 
}, - _ => self.columns[0].clone(), + _ => self.columns[0].as_materialized_series().clone(), })), _ => { let columns = self @@ -2565,8 +3176,7 @@ impl DataFrame { let null_count = || { numeric_df - .columns - .par_iter() + .par_materialized_column_iter() .map(|s| { s.is_null() .cast_with_options(&DataType::UInt32, CastOptions::NonStrict) @@ -2817,7 +3427,7 @@ impl DataFrame { let cols = self .columns .iter() - .map(|s| Series::new(s.name().clone(), [s.null_count() as IdxSize])) + .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize])) .collect(); unsafe { Self::new_no_checks(cols) } } @@ -2982,8 +3592,14 @@ impl DataFrame { let mut count = 0; for s in &self.columns { if cols.contains(s.name()) { - let ca = s.struct_()?.clone(); - new_cols.extend_from_slice(&ca.fields_as_series()); + let ca = s.as_materialized_series().struct_()?.clone(); + // @scalar-opt + new_cols.extend_from_slice( + &ca.fields_as_series() + .into_iter() + .map(Column::from) + .collect::>(), + ); count += 1; } else { new_cols.push(s.clone()) @@ -3004,7 +3620,7 @@ impl DataFrame { } pub struct RecordBatchIter<'a> { - columns: &'a Vec, + columns: &'a Vec, idx: usize, n_chunks: usize, compat_level: CompatLevel, @@ -3023,11 +3639,13 @@ impl<'a> Iterator for RecordBatchIter<'a> { let iter = self .columns .par_iter() + .map(Column::as_materialized_series) .map(|s| s.to_arrow(self.idx, self.compat_level)); POOL.install(|| iter.collect()) } else { self.columns .iter() + .map(Column::as_materialized_series) .map(|s| s.to_arrow(self.idx, self.compat_level)) .collect() }; @@ -3073,7 +3691,7 @@ impl Default for DataFrame { } } -impl From for Vec { +impl From for Vec { fn from(df: DataFrame) -> Self { df.columns } @@ -3094,8 +3712,8 @@ mod test { use super::*; fn create_frame() -> DataFrame { - let s0 = Series::new("days".into(), [0, 1, 2].as_ref()); - let s1 = Series::new("temp".into(), [22.1, 19.9, 7.].as_ref()); + let s0 = Column::new("days".into(), [0, 1, 2].as_ref()); + let s1 = 
Column::new("temp".into(), [22.1, 19.9, 7.].as_ref()); DataFrame::new(vec![s0, s1]).unwrap() } @@ -3115,7 +3733,16 @@ mod test { #[cfg_attr(miri, ignore)] fn test_select() { let df = create_frame(); - assert_eq!(df.column("days").unwrap().equal(1).unwrap().sum(), Some(1)); + assert_eq!( + df.column("days") + .unwrap() + .as_series() + .unwrap() + .equal(1) + .unwrap() + .sum(), + Some(1) + ); } #[test] @@ -3123,13 +3750,25 @@ mod test { fn test_filter_broadcast_on_string_col() { let col_name = "some_col"; let v = vec!["test".to_string()]; - let s0 = Series::new(PlSmallStr::from_str(col_name), v); + let s0 = Column::new(PlSmallStr::from_str(col_name), v); let mut df = DataFrame::new(vec![s0]).unwrap(); df = df - .filter(&df.column(col_name).unwrap().equal("").unwrap()) + .filter( + &df.column(col_name) + .unwrap() + .as_materialized_series() + .equal("") + .unwrap(), + ) .unwrap(); - assert_eq!(df.column(col_name).unwrap().n_chunks(), 1); + assert_eq!( + df.column(col_name) + .unwrap() + .as_materialized_series() + .n_chunks(), + 1 + ); } #[test] @@ -3235,9 +3874,9 @@ mod test { #[cfg(feature = "zip_with")] #[cfg_attr(miri, ignore)] fn test_horizontal_agg() { - let a = Series::new("a".into(), [1, 2, 6]); - let b = Series::new("b".into(), [Some(1), None, None]); - let c = Series::new("c".into(), [Some(4), None, Some(3)]); + let a = Column::new("a".into(), [1, 2, 6]); + let b = Column::new("b".into(), [Some(1), None, None]); + let c = Column::new("c".into(), [Some(4), None, Some(3)]); let df = DataFrame::new(vec![a, b, c]).unwrap(); assert_eq!( diff --git a/crates/polars-core/src/frame/row/dataframe.rs b/crates/polars-core/src/frame/row/dataframe.rs index 4a40a9ed6d6f..1d11dcd9ecc0 100644 --- a/crates/polars-core/src/frame/row/dataframe.rs +++ b/crates/polars-core/src/frame/row/dataframe.rs @@ -4,8 +4,7 @@ impl DataFrame { /// Get a row from a [`DataFrame`]. Use of this is discouraged as it will likely be slow. 
pub fn get_row(&self, idx: usize) -> PolarsResult { let values = self - .columns - .iter() + .materialized_column_iter() .map(|s| s.get(idx)) .collect::>>()?; Ok(Row(values)) @@ -15,7 +14,7 @@ impl DataFrame { /// The caller is responsible to make sure that the row has at least the capacity for the number /// of columns in the [`DataFrame`] pub fn get_row_amortized<'a>(&'a self, idx: usize, row: &mut Row<'a>) -> PolarsResult<()> { - for (s, any_val) in self.columns.iter().zip(&mut row.0) { + for (s, any_val) in self.materialized_column_iter().zip(&mut row.0) { *any_val = s.get(idx)?; } Ok(()) @@ -29,8 +28,7 @@ impl DataFrame { /// Does not do any bounds checking. #[inline] pub unsafe fn get_row_amortized_unchecked<'a>(&'a self, idx: usize, row: &mut Row<'a>) { - self.columns - .iter() + self.materialized_column_iter() .zip(&mut row.0) .for_each(|(s, any_val)| { *any_val = s.get_unchecked(idx); @@ -75,14 +73,14 @@ impl DataFrame { .into_iter() .zip(schema.iter_names()) .map(|(b, name)| { - let mut s = b.into_series(); + let mut c = b.into_series().into_column(); // if the schema adds a column not in the rows, we // fill it with nulls - if s.is_empty() { - Series::full_null(name.clone(), expected_len, s.dtype()) + if c.is_empty() { + Column::full_null(name.clone(), expected_len, c.dtype()) } else { - s.rename(name.clone()); - s + c.rename(name.clone()); + c } }) .collect(); @@ -117,14 +115,14 @@ impl DataFrame { .into_iter() .zip(schema.iter_names()) .map(|(b, name)| { - let mut s = b.into_series(); + let mut c = b.into_series().into_column(); // if the schema adds a column not in the rows, we // fill it with nulls - if s.is_empty() { - Series::full_null(name.clone(), expected_len, s.dtype()) + if c.is_empty() { + Column::full_null(name.clone(), expected_len, c.dtype()) } else { - s.rename(name.clone()); - s + c.rename(name.clone()); + c } }) .collect(); diff --git a/crates/polars-core/src/frame/row/mod.rs b/crates/polars-core/src/frame/row/mod.rs index 
44e445b0874e..87904e6f98cb 100644 --- a/crates/polars-core/src/frame/row/mod.rs +++ b/crates/polars-core/src/frame/row/mod.rs @@ -68,7 +68,7 @@ impl DataFrame { let width = self.width(); let size = width * self.height(); let mut buf = vec![AnyValue::Null; size]; - for (col_i, s) in self.columns.iter().enumerate() { + for (col_i, s) in self.materialized_column_iter().enumerate() { match s.dtype() { #[cfg(feature = "object")] DataType::Object(_, _) => { diff --git a/crates/polars-core/src/frame/row/transpose.rs b/crates/polars-core/src/frame/row/transpose.rs index 1984a085116f..0f41bb2749d5 100644 --- a/crates/polars-core/src/frame/row/transpose.rs +++ b/crates/polars-core/src/frame/row/transpose.rs @@ -15,15 +15,15 @@ impl DataFrame { let new_height = self.width(); // Allocate space for the transposed columns, putting the "row names" first if needed let mut cols_t = match keep_names_as { - None => Vec::::with_capacity(new_width), + None => Vec::::with_capacity(new_width), Some(name) => { - let mut tmp = Vec::::with_capacity(new_width + 1); + let mut tmp = Vec::::with_capacity(new_width + 1); tmp.push( StringChunked::from_iter_values( name, self.get_column_names_owned().into_iter(), ) - .into(), + .into_column(), ); tmp }, @@ -60,8 +60,7 @@ impl DataFrame { .collect::>(); let columns = self - .columns - .iter() + .materialized_column_iter() // first cast to supertype before casting to physical to ensure units are correct .map(|s| s.cast(dtype).unwrap().cast(&phys_dtype).unwrap()) .collect::>(); @@ -81,7 +80,7 @@ impl DataFrame { // SAFETY: we are casting back to the supertype let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() }; s.rename(name.clone()); - s + s.into() })); }, }; @@ -183,9 +182,9 @@ unsafe fn add_value( // This just fills a pre-allocated mutable series vector, which may have a name column. // Nothing is returned and the actual DataFrame is constructed above. 
pub(super) fn numeric_transpose( - cols: &[Series], + cols: &[Column], names_out: &[PlSmallStr], - cols_t: &mut Vec, + cols_t: &mut Vec, ) where T: PolarsNumericType, //S: AsRef, @@ -211,43 +210,46 @@ pub(super) fn numeric_transpose( let validity_buf_ptr = &mut validity_buf as *mut Vec> as usize; POOL.install(|| { - cols.iter().enumerate().for_each(|(row_idx, s)| { - let s = s.cast(&T::get_dtype()).unwrap(); - let ca = s.unpack::().unwrap(); + cols.iter() + .map(Column::as_materialized_series) + .enumerate() + .for_each(|(row_idx, s)| { + let s = s.cast(&T::get_dtype()).unwrap(); + let ca = s.unpack::().unwrap(); - // SAFETY: - // we access in parallel, but every access is unique, so we don't break aliasing rules - // we also ensured we allocated enough memory, so we never reallocate and thus - // the pointers remain valid. - if has_nulls { - for (col_idx, opt_v) in ca.iter().enumerate() { - match opt_v { - None => unsafe { - let column = (*(validity_buf_ptr as *mut Vec>)) + // SAFETY: + // we access in parallel, but every access is unique, so we don't break aliasing rules + // we also ensured we allocated enough memory, so we never reallocate and thus + // the pointers remain valid. + if has_nulls { + for (col_idx, opt_v) in ca.iter().enumerate() { + match opt_v { + None => unsafe { + let column = (*(validity_buf_ptr as *mut Vec>)) + .get_unchecked_mut(col_idx); + let el_ptr = column.as_mut_ptr(); + *el_ptr.add(row_idx) = false; + // we must initialize this memory otherwise downstream code + // might access uninitialized memory when the masked out values + // are changed. 
+ add_value(values_buf_ptr, col_idx, row_idx, T::Native::default()); + }, + Some(v) => unsafe { + add_value(values_buf_ptr, col_idx, row_idx, v); + }, + } + } + } else { + for (col_idx, v) in ca.into_no_null_iter().enumerate() { + unsafe { + let column = (*(values_buf_ptr as *mut Vec>)) .get_unchecked_mut(col_idx); let el_ptr = column.as_mut_ptr(); - *el_ptr.add(row_idx) = false; - // we must initialize this memory otherwise downstream code - // might access uninitialized memory when the masked out values - // are changed. - add_value(values_buf_ptr, col_idx, row_idx, T::Native::default()); - }, - Some(v) => unsafe { - add_value(values_buf_ptr, col_idx, row_idx, v); - }, - } - } - } else { - for (col_idx, v) in ca.into_no_null_iter().enumerate() { - unsafe { - let column = (*(values_buf_ptr as *mut Vec>)) - .get_unchecked_mut(col_idx); - let el_ptr = column.as_mut_ptr(); - *el_ptr.add(row_idx) = v; + *el_ptr.add(row_idx) = v; + } } } - } - }) + }) }); let par_iter = values_buf @@ -277,7 +279,7 @@ pub(super) fn numeric_transpose( values.into(), validity, ); - ChunkedArray::with_chunk(name.clone(), arr).into_series() + ChunkedArray::with_chunk(name.clone(), arr).into_column() }); POOL.install(|| cols_t.par_extend(par_iter)); } diff --git a/crates/polars-core/src/frame/top_k.rs b/crates/polars-core/src/frame/top_k.rs index af3351d79fba..dd610a2383d4 100644 --- a/crates/polars-core/src/frame/top_k.rs +++ b/crates/polars-core/src/frame/top_k.rs @@ -5,7 +5,7 @@ impl DataFrame { pub(crate) fn bottom_k_impl( &self, k: usize, - by_column: Vec, + by_column: Vec, mut sort_options: SortMultipleOptions, ) -> PolarsResult { let first_descending = sort_options.descending[0]; diff --git a/crates/polars-core/src/frame/upstream_traits.rs b/crates/polars-core/src/frame/upstream_traits.rs index e2f28aefdb33..11bfc88ef196 100644 --- a/crates/polars-core/src/frame/upstream_traits.rs +++ b/crates/polars-core/src/frame/upstream_traits.rs @@ -7,13 +7,13 @@ impl FromIterator for DataFrame { 
/// /// Panics if Series have different lengths. fn from_iter>(iter: T) -> Self { - let v = iter.into_iter().collect(); + let v = iter.into_iter().map(Column::from).collect(); DataFrame::new(v).expect("could not create DataFrame from iterator") } } impl Index for DataFrame { - type Output = Series; + type Output = Column; fn index(&self, index: usize) -> &Self::Output { &self.columns[index] @@ -23,7 +23,7 @@ impl Index for DataFrame { macro_rules! impl_ranges { ($range_type:ty) => { impl Index<$range_type> for DataFrame { - type Output = [Series]; + type Output = [Column]; fn index(&self, index: $range_type) -> &Self::Output { &self.columns[index] @@ -41,7 +41,7 @@ impl_ranges!(RangeFull); // we don't implement Borrow or AsRef as upstream crates may add impl of trait for usize. impl Index<&str> for DataFrame { - type Output = Series; + type Output = Column; fn index(&self, index: &str) -> &Self::Output { let idx = self.check_name_to_idx(index).unwrap(); diff --git a/crates/polars-core/src/functions.rs b/crates/polars-core/src/functions.rs index 57cbee3a01dc..50ce5d14e491 100644 --- a/crates/polars-core/src/functions.rs +++ b/crates/polars-core/src/functions.rs @@ -35,7 +35,7 @@ pub fn concat_df_diagonal(dfs: &[DataFrame]) -> PolarsResult { for (name, dtype) in &schema { match df.column(name.as_str()).ok() { Some(s) => columns.push(s.clone()), - None => columns.push(Series::full_null(name.clone(), height, dtype)), + None => columns.push(Column::full_null(name.clone(), height, dtype)), } } unsafe { DataFrame::new_no_checks(columns) } diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index 277c1c009ba0..7dfb07c64d58 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -450,7 +450,7 @@ pub fn _df_rows_to_hashes_threaded_vertical( .map(|df| { let hb = hasher_builder.clone(); let mut hashes = vec![]; - series_to_hashes(df.get_columns(), Some(hb), 
&mut hashes)?; + columns_to_hashes(df.get_columns(), Some(hb), &mut hashes)?; Ok(UInt64Chunked::from_vec(PlSmallStr::EMPTY, hashes)) }) .collect::>>() @@ -458,8 +458,8 @@ pub fn _df_rows_to_hashes_threaded_vertical( Ok((hashes, hasher_builder)) } -pub(crate) fn series_to_hashes( - keys: &[Series], +pub(crate) fn columns_to_hashes( + keys: &[Column], build_hasher: Option, hashes: &mut Vec, ) -> PolarsResult { diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index 996c9b83c5c5..f885ebac5d18 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -45,7 +45,7 @@ pub use crate::frame::explode::UnpivotArgsIR; pub(crate) use crate::frame::group_by::aggregations::*; #[cfg(feature = "algorithm_group_by")] pub use crate::frame::group_by::*; -pub use crate::frame::{DataFrame, UniqueKeepStrategy}; +pub use crate::frame::{DataFrame, Column, UniqueKeepStrategy}; pub use crate::hashing::VecHash; pub use crate::named_from::{NamedFrom, NamedFromOwned}; pub use crate::scalar::Scalar; diff --git a/crates/polars-core/src/serde/df.rs b/crates/polars-core/src/serde/df.rs index 31d1934504f5..677f455d552a 100644 --- a/crates/polars-core/src/serde/df.rs +++ b/crates/polars-core/src/serde/df.rs @@ -2,7 +2,7 @@ use polars_error::PolarsError; use serde::de::Error; use serde::*; -use crate::prelude::{DataFrame, Series}; +use crate::prelude::{DataFrame, Column}; // utility to ensure we serde to a struct // { @@ -12,12 +12,12 @@ use crate::prelude::{DataFrame, Series}; // and is backwards compatible #[derive(Deserialize)] struct Util { - columns: Vec, + columns: Vec, } #[derive(Serialize)] struct UtilBorrowed<'a> { - columns: &'a [Series], + columns: &'a [Column], } impl<'de> Deserialize<'de> for DataFrame { diff --git a/crates/polars-core/src/serde/mod.rs b/crates/polars-core/src/serde/mod.rs index 86fbf5c52007..d355f959fd15 100644 --- a/crates/polars-core/src/serde/mod.rs +++ b/crates/polars-core/src/serde/mod.rs @@ -42,9 
+42,9 @@ mod test { let s1 = Series::new("foo".into(), &[1, 2, 3]); let s2 = Series::new("bar".into(), &[Some(true), None, Some(false)]); let s3 = Series::new("string".into(), &["mouse", "elephant", "dog"]); - let s_list = Series::new("list".into(), &[s1.clone(), s1.clone(), s1.clone()]); + let s_list = Column::new("list".into(), &[s1.clone(), s1.clone(), s1.clone()]); - DataFrame::new(vec![s1, s2, s3, s_list]).unwrap() + DataFrame::new(vec![s1.into(), s2.into(), s3.into(), s_list]).unwrap() } #[test] @@ -89,7 +89,7 @@ mod test { #[test] fn test_serde_binary_series_owned_bincode() { - let s1 = Series::new( + let s1 = Column::new( "foo".into(), &[ vec![1u8, 2u8, 3u8], @@ -142,7 +142,7 @@ mod test { let s = Series::from_any_values_and_dtype("item".into(), &[row_1, row_2, row_3], &dtype, false) .unwrap(); - let df = DataFrame::new(vec![s]).unwrap(); + let df = DataFrame::new(vec![s.into()]).unwrap(); let df_str = serde_json::to_string(&df).unwrap(); let out = serde_json::from_str::(&df_str).unwrap(); diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index ce473a4d60fb..fa2019f0f000 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -732,6 +732,14 @@ pub unsafe trait IntoSeries { fn into_series(self) -> Series where Self: Sized; + + #[inline(always)] + fn into_column(self) -> Column + where + Self: Sized + { + Column::from(self.into_series()) + } } impl From> for Series diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index 8cdf326302d1..7c5af2b9ccc7 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -88,7 +88,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git 
a/crates/polars-core/src/series/implementations/binary_offset.rs b/crates/polars-core/src/series/implementations/binary_offset.rs index 9ff8cd6704d0..481b5c5bf47e 100644 --- a/crates/polars-core/src/series/implementations/binary_offset.rs +++ b/crates/polars-core/src/series/implementations/binary_offset.rs @@ -54,7 +54,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index aae8a5837af8..30c78b95943d 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -91,7 +91,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 497ff5267d88..4e7b6efe04a2 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -117,7 +117,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/date.rs b/crates/polars-core/src/series/implementations/date.rs index 834449e73992..479478a94530 100644 --- a/crates/polars-core/src/series/implementations/date.rs +++ b/crates/polars-core/src/series/implementations/date.rs @@ -132,7 +132,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: 
&SortMultipleOptions, ) -> PolarsResult { self.0.deref().arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index a6a5f111d541..b91df29a0a38 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -130,7 +130,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.deref().arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 73d2e4f730fb..13b121aee0ca 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -244,7 +244,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.deref().arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index cc52d73cdc60..de349c2a22f5 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -148,7 +148,7 @@ macro_rules! impl_dyn_series { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 3e4e41395b0b..4116df5a42fa 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -221,7 +221,7 @@ macro_rules! 
impl_dyn_series { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/string.rs b/crates/polars-core/src/series/implementations/string.rs index c8d85825e84b..8b64afcd9895 100644 --- a/crates/polars-core/src/series/implementations/string.rs +++ b/crates/polars-core/src/series/implementations/string.rs @@ -87,7 +87,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/implementations/time.rs b/crates/polars-core/src/series/implementations/time.rs index 3808f7d977af..ed810d34b3f4 100644 --- a/crates/polars-core/src/series/implementations/time.rs +++ b/crates/polars-core/src/series/implementations/time.rs @@ -107,7 +107,7 @@ impl private::PrivateSeries for SeriesWrap { fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], options: &SortMultipleOptions, ) -> PolarsResult { self.0.deref().arg_sort_multiple(by, options) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index a629a8fd1c5c..e54f7fcc98d6 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -255,7 +255,7 @@ impl Series { pub fn into_frame(self) -> DataFrame { // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names - unsafe { DataFrame::new_no_checks(vec![self]) } + unsafe { DataFrame::new_no_checks(vec![self.into()]) } } /// Rename series. 
diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index b5b60c5eff33..4e90f9d757d9 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -163,7 +163,7 @@ pub(crate) mod private { #[allow(unused_variables)] fn arg_sort_multiple( &self, - by: &[Series], + by: &[Column], _options: &SortMultipleOptions, ) -> PolarsResult { polars_bail!(opq = arg_sort_multiple, self._dtype()); diff --git a/crates/polars-core/src/testing.rs b/crates/polars-core/src/testing.rs index bf056b5f7769..f227f2bfe861 100644 --- a/crates/polars-core/src/testing.rs +++ b/crates/polars-core/src/testing.rs @@ -199,8 +199,8 @@ mod test { #[test] fn test_df_equal() { - let a = Series::new("a".into(), [1, 2, 3].as_ref()); - let b = Series::new("b".into(), [1, 2, 3].as_ref()); + let a = Column::new("a".into(), [1, 2, 3].as_ref()); + let b = Column::new("b".into(), [1, 2, 3].as_ref()); let df1 = DataFrame::new(vec![a, b]).unwrap(); assert!(df1.equals(&df1)) diff --git a/crates/polars-core/src/tests.rs b/crates/polars-core/src/tests.rs index e8a8111225b7..b1c042e80a4b 100644 --- a/crates/polars-core/src/tests.rs +++ b/crates/polars-core/src/tests.rs @@ -4,9 +4,9 @@ use crate::prelude::*; fn test_initial_empty_sort() -> PolarsResult<()> { // https://github.com/pola-rs/polars/issues/1396 let data = vec![1.3; 42]; - let mut series = Series::new("data".into(), Vec::::new()); - let series2 = Series::new("data2".into(), data.clone()); - let series3 = Series::new("data3".into(), data); + let mut series = Column::new("data".into(), Vec::::new()); + let series2 = Column::new("data2".into(), data.clone()); + let series3 = Column::new("data3".into(), data); let df = DataFrame::new(vec![series2, series3])?; for column in df.get_columns().iter() { diff --git a/crates/polars-core/src/utils/flatten.rs b/crates/polars-core/src/utils/flatten.rs index 52b1c69ea6d9..b96ce61dab82 100644 --- 
a/crates/polars-core/src/utils/flatten.rs +++ b/crates/polars-core/src/utils/flatten.rs @@ -15,7 +15,7 @@ pub fn flatten_df_iter(df: &DataFrame) -> impl Iterator + '_ { Series::from_chunks_and_dtype_unchecked(s.name().clone(), vec![arr], s.dtype()) }; out.set_sorted_flag(s.is_sorted_flag()); - out + Column::from(out) }) .collect(); let df = unsafe { DataFrame::new_no_checks(columns) }; diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index a516626e1abb..ebe2faa17918 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -141,7 +141,9 @@ impl Container for DataFrame { } fn chunk_lengths(&self) -> impl Iterator { - self.get_columns()[0].chunk_lengths() + // @scalar-correctness? + // This should return a option + self.first_series_column().unwrap().chunk_lengths() } } @@ -684,7 +686,7 @@ macro_rules! apply_method_physical_numeric { macro_rules! df { ($($col_name:expr => $slice:expr), + $(,)?) => { $crate::prelude::DataFrame::new(vec![ - $(<$crate::prelude::Series as $crate::prelude::NamedFrom::<_, _>>::new($col_name.into(), $slice),)+ + $($crate::prelude::Column::from(<$crate::prelude::Series as $crate::prelude::NamedFrom::<_, _>>::new($col_name.into(), $slice)),)+ ]) } } diff --git a/crates/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs index 9772a5593be0..9487ed17b4c7 100644 --- a/crates/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -216,7 +216,7 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { // Fallback to more generic impl. - top_k_by_impl(k, src, &[src.clone()], vec![descending]) + top_k_by_impl(k, src, &[src.clone().into()], vec![descending]) }, _dt => { macro_rules! 
dispatch { @@ -229,9 +229,9 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { } } -pub fn top_k_by(s: &[Series], descending: Vec) -> PolarsResult { +pub fn top_k_by(s: &[Column], descending: Vec) -> PolarsResult { /// Return (k, src, by) - fn extract_parameters(s: &[Series]) -> PolarsResult<(usize, &Series, &[Series])> { + fn extract_parameters(s: &[Column]) -> PolarsResult<(usize, &Series, &[Column])> { let k_s = &s[1]; polars_ensure!( @@ -243,7 +243,7 @@ pub fn top_k_by(s: &[Series], descending: Vec) -> PolarsResult { polars_bail!(ComputeError: "`k` must be set for `top_k`") }; - let src = &s[0]; + let src = &s[0].as_materialized_series(); let by = &s[2..]; @@ -272,7 +272,7 @@ pub fn top_k_by(s: &[Series], descending: Vec) -> PolarsResult { fn top_k_by_impl( k: usize, src: &Series, - by: &[Series], + by: &[Column], descending: Vec, ) -> PolarsResult { if src.is_empty() { diff --git a/crates/polars-ops/src/frame/join/asof/groups.rs b/crates/polars-ops/src/frame/join/asof/groups.rs index 81b05a4b752d..6a7ccb3b76f8 100644 --- a/crates/polars-ops/src/frame/join/asof/groups.rs +++ b/crates/polars-ops/src/frame/join/asof/groups.rs @@ -32,8 +32,8 @@ pub(crate) unsafe fn compare_df_rows2( join_nulls: bool, ) -> bool { for (l, r) in left.get_columns().iter().zip(right.get_columns()) { - let l = l.get_unchecked(left_idx); - let r = r.get_unchecked(right_idx); + let l = l.as_materialized_series().get_unchecked(left_idx); + let r = r.as_materialized_series().get_unchecked(right_idx); if !l.eq_missing(&r, join_nulls) { return false; } @@ -398,8 +398,8 @@ where F: Sync + for<'a> Fn(T::Physical<'a>, T::Physical<'a>) -> bool, { let out = if left_by.width() == 1 { - let left_by_s = left_by.get_columns()[0].to_physical_repr().into_owned(); - let right_by_s = right_by.get_columns()[0].to_physical_repr().into_owned(); + let left_by_s = left_by.get_columns()[0].to_physical_repr(); + let right_by_s = right_by.get_columns()[0].to_physical_repr(); let left_dtype = 
left_by_s.dtype(); let right_dtype = right_by_s.dtype(); polars_ensure!(left_dtype == right_dtype, @@ -418,8 +418,8 @@ where }, x if x.is_float() => { with_match_physical_float_polars_type!(left_by_s.dtype(), |$T| { - let left_by: &ChunkedArray<$T> = left_by_s.as_ref().as_ref().as_ref(); - let right_by: &ChunkedArray<$T> = right_by_s.as_ref().as_ref().as_ref(); + let left_by: &ChunkedArray<$T> = left_by_s.as_materialized_series().as_ref().as_ref().as_ref(); + let right_by: &ChunkedArray<$T> = right_by_s.as_materialized_series().as_ref().as_ref().as_ref(); asof_join_by_numeric::( left_by, right_by, left_asof, right_asof, filter, )? @@ -648,8 +648,8 @@ pub trait AsofJoinBy: IntoDf { { #[cfg(feature = "dtype-categorical")] _check_categorical_src(l.dtype(), r.dtype())?; - *l = l.to_physical_repr().into_owned(); - *r = r.to_physical_repr().into_owned(); + *l = l.to_physical_repr(); + *r = r.to_physical_repr(); } } @@ -707,8 +707,8 @@ pub trait AsofJoinBy: IntoDf { let self_df = self.to_df(); let left_by = left_by.into_iter().map(|s| s.as_ref().into()).collect(); let right_by = right_by.into_iter().map(|s| s.as_ref().into()).collect(); - let left_key = self_df.column(left_on)?; - let right_key = other.column(right_on)?; + let left_key = self_df.column(left_on)?.as_materialized_series(); + let right_key = other.column(right_on)?.as_materialized_series(); self_df._join_asof_by( other, left_key, right_key, left_by, right_by, strategy, tolerance, None, None, true, ) diff --git a/crates/polars-ops/src/frame/join/general.rs b/crates/polars-ops/src/frame/join/general.rs index 5840b853425c..1420d7b66062 100644 --- a/crates/polars-ops/src/frame/join/general.rs +++ b/crates/polars-ops/src/frame/join/general.rs @@ -1,7 +1,7 @@ use polars_utils::format_pl_smallstr; use super::*; -use crate::series::coalesce_series; +use crate::series::coalesce_columns; pub fn _join_suffix_name(name: &str, suffix: &str) -> PlSmallStr { format_pl_smallstr!("{name}{suffix}") @@ -83,7 +83,7 @@ pub fn 
_coalesce_full_join( let l = columns[pos_l].clone(); let r = columns[pos_r].clone(); - columns[pos_l] = coalesce_series(&[l, r]).unwrap(); + columns[pos_l] = coalesce_columns(&[l, r]).unwrap(); to_remove.push(pos_r); } // sort in reverse order, so the indexes remain correct if we remove. diff --git a/crates/polars-ops/src/frame/join/iejoin/mod.rs b/crates/polars-ops/src/frame/join/iejoin/mod.rs index cd19f7049abe..5d655ca5ee22 100644 --- a/crates/polars-ops/src/frame/join/iejoin/mod.rs +++ b/crates/polars-ops/src/frame/join/iejoin/mod.rs @@ -205,8 +205,8 @@ where pub(super) fn iejoin_par( left: &DataFrame, right: &DataFrame, - selected_left: Vec, - selected_right: Vec, + selected_left: Vec, + selected_right: Vec, options: &IEJoinOptions, suffix: Option, slice: Option<(i64, usize)>, @@ -221,12 +221,12 @@ pub(super) fn iejoin_par( .with_nulls_last(false) .with_order_descending(l1_descending); - let sl = &selected_left[0]; + let sl = &selected_left[0].as_materialized_series(); let l1_s_l = sl .arg_sort(l1_sort_options) .slice(sl.null_count() as i64, sl.len() - sl.null_count()); - let sr = &selected_right[0]; + let sr = &selected_right[0].as_materialized_series(); let l1_s_r = sr .arg_sort(l1_sort_options) .slice(sr.null_count() as i64, sr.len() - sr.null_count()); @@ -282,11 +282,11 @@ pub(super) fn iejoin_par( ( selected_left .iter() - .map(|s| s.take_unchecked(l_l1_idx)) + .map(|s| s.as_materialized_series().take_unchecked(l_l1_idx).into()) .collect_vec(), selected_right .iter() - .map(|s| s.take_unchecked(r_l1_idx)) + .map(|s| s.as_materialized_series().take_unchecked(r_l1_idx).into()) .collect_vec(), ) }; @@ -342,8 +342,8 @@ pub(super) fn iejoin_par( pub(super) fn iejoin( left: &DataFrame, right: &DataFrame, - selected_left: Vec, - selected_right: Vec, + selected_left: Vec, + selected_right: Vec, options: &IEJoinOptions, suffix: Option, slice: Option<(i64, usize)>, @@ -378,8 +378,8 @@ unsafe fn materialize_join( /// Based on Khayyat et al. 
2015, "Lightning Fast and Space Efficient Inequality Joins" /// and extended to work with duplicate values. fn iejoin_tuples( - selected_left: Vec, - selected_right: Vec, + selected_left: Vec, + selected_right: Vec, options: &IEJoinOptions, slice: Option<(i64, usize)>, ) -> PolarsResult<(IdxCa, IdxCa)> { @@ -411,14 +411,14 @@ fn iejoin_tuples( let l1_descending = matches!(op1, InequalityOperator::Gt | InequalityOperator::GtEq); let l2_descending = matches!(op2, InequalityOperator::Lt | InequalityOperator::LtEq); - let mut x = selected_left[0].to_physical_repr().into_owned(); + let mut x = selected_left[0].to_physical_repr(); let left_height = x.len(); x.extend(&selected_right[0].to_physical_repr())?; // Rechunk because we will gather. let x = x.rechunk(); - let mut y = selected_left[1].to_physical_repr().into_owned(); + let mut y = selected_left[1].to_physical_repr(); y.extend(&selected_right[1].to_physical_repr())?; // Rechunk because we will gather. let y = y.rechunk(); @@ -432,7 +432,7 @@ fn iejoin_tuples( .arg_sort(l1_sort_options) .slice(x.null_count() as i64, x.len() - x.null_count()); - let y_ordered_by_x = unsafe { y.take_unchecked(&l1_order) }; + let y_ordered_by_x = unsafe { y.as_materialized_series().take_unchecked(&l1_order) }; let l2_sort_options = SortOptions::default() .with_maintain_order(true) .with_nulls_last(false) @@ -455,7 +455,7 @@ fn iejoin_tuples( l2_order, op1, op2, - x, + x.as_materialized_series().clone(), y_ordered_by_x, left_height ) diff --git a/crates/polars-ops/src/frame/join/merge_sorted.rs b/crates/polars-ops/src/frame/join/merge_sorted.rs index a9f02c2904cd..8ab303fae2c1 100644 --- a/crates/polars-ops/src/frame/join/merge_sorted.rs +++ b/crates/polars-ops/src/frame/join/merge_sorted.rs @@ -36,7 +36,11 @@ pub fn _merge_sorted_dfs( let lhs_phys = lhs.to_physical_repr(); let rhs_phys = rhs.to_physical_repr(); - let out = merge_series(&lhs_phys, &rhs_phys, &merge_indicator)?; + let out = Column::from(merge_series( + 
lhs_phys.as_materialized_series(), + rhs_phys.as_materialized_series(), + &merge_indicator, + )?); let mut out = out.cast(lhs.dtype()).unwrap(); out.rename(lhs.name().clone()); Ok(out) diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 89507ac216c5..2cd9bd323690 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -93,8 +93,8 @@ pub trait DataFrameJoinOps: IntoDf { args: JoinArgs, ) -> PolarsResult { let df_left = self.to_df(); - let selected_left = df_left.select_series(left_on)?; - let selected_right = other.select_series(right_on)?; + let selected_left = df_left.select_columns(left_on)?; + let selected_right = other.select_columns(right_on)?; self._join_impl(other, selected_left, selected_right, args, true, false) } @@ -104,8 +104,8 @@ pub trait DataFrameJoinOps: IntoDf { fn _join_impl( &self, other: &DataFrame, - mut selected_left: Vec, - mut selected_right: Vec, + mut selected_left: Vec, + mut selected_right: Vec, mut args: JoinArgs, _check_rechunk: bool, _verbose: bool, @@ -118,7 +118,7 @@ pub trait DataFrameJoinOps: IntoDf { } // Clear literals if a frame is empty. Otherwise we could get an oob - fn clear(s: &mut [Series]) { + fn clear(s: &mut [Column]) { for s in s.iter_mut() { if s.len() == 1 { *s = s.clear() @@ -195,8 +195,8 @@ pub trait DataFrameJoinOps: IntoDf { Err(_) => { let (ca_left, ca_right) = make_categoricals_compatible(l.categorical()?, r.categorical()?)?; - *l = ca_left.into_series().with_name(l.name().clone()); - *r = ca_right.into_series().with_name(r.name().clone()); + *l = ca_left.into_column().with_name(l.name().clone()); + *r = ca_right.into_column().with_name(r.name().clone()); }, } } @@ -222,8 +222,8 @@ pub trait DataFrameJoinOps: IntoDf { // Single keys. 
if selected_left.len() == 1 { - let s_left = &selected_left[0]; - let s_right = &selected_right[0]; + let s_left = &selected_left[0].as_materialized_series(); + let s_right = &selected_right[0].as_materialized_series(); let drop_names: Option> = if should_coalesce { None } else { Some(vec![]) }; return match args.how { @@ -377,8 +377,8 @@ pub trait DataFrameJoinOps: IntoDf { #[cfg(feature = "semi_anti_join")] JoinType::Anti | JoinType::Semi => self._join_impl( other, - vec![lhs_keys], - vec![rhs_keys], + vec![lhs_keys.into()], + vec![rhs_keys.into()], args, _check_rechunk, _verbose, @@ -513,15 +513,15 @@ trait DataFrameJoinOpsPrivate: IntoDf { impl DataFrameJoinOps for DataFrame {} impl DataFrameJoinOpsPrivate for DataFrame {} -fn prepare_keys_multiple(s: &[Series], join_nulls: bool) -> PolarsResult { +fn prepare_keys_multiple(s: &[Column], join_nulls: bool) -> PolarsResult { let keys = s .iter() .map(|s| { let phys = s.to_physical_repr(); match phys.dtype() { - DataType::Float32 => phys.f32().unwrap().to_canonical().into_series(), - DataType::Float64 => phys.f64().unwrap().to_canonical().into_series(), - _ => phys.into_owned(), + DataType::Float32 => phys.f32().unwrap().to_canonical().into_column(), + DataType::Float64 => phys.f64().unwrap().to_canonical().into_column(), + _ => phys, } }) .collect::>(); diff --git a/crates/polars-ops/src/frame/mod.rs b/crates/polars-ops/src/frame/mod.rs index 5691919c8861..539d4e0cebc1 100644 --- a/crates/polars-ops/src/frame/mod.rs +++ b/crates/polars-ops/src/frame/mod.rs @@ -106,7 +106,7 @@ pub trait DataFrameOps: IntoDf { df.get_columns() .par_iter() .map(|s| match set.contains(s.name().as_str()) { - true => s.to_dummies(separator, drop_first), + true => s.as_materialized_series().to_dummies(separator, drop_first), false => Ok(s.clone().into_frame()), }) .collect::>>() diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index d909b580f87b..d681b5db8a90 100644 --- 
a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -232,9 +232,16 @@ fn pivot_impl( polars_bail!(ComputeError: "cannot use column name {column} that \ already exists in the DataFrame. Please rename it prior to calling `pivot`.") } - let columns_struct = StructChunked::from_series(column.clone(), fields) - .unwrap() - .into_series(); + // @scalar-opt + let columns_struct = StructChunked::from_series( + column.clone(), + &fields + .iter() + .map(|c| c.as_materialized_series().clone()) + .collect::>(), + ) + .unwrap() + .into_series(); let mut binding = pivot_df.clone(); let pivot_df = unsafe { binding.with_column_unchecked(columns_struct) }; pivot_impl_single_column( @@ -306,13 +313,13 @@ fn pivot_impl_single_column( First => value_col.agg_first(&groups), Mean => value_col.agg_mean(&groups), Median => value_col.agg_median(&groups), - Count => groups.group_count().into_series(), + Count => groups.group_count().into_column(), Expr(ref expr) => { let name = expr.root_name()?.clone(); let mut value_col = value_col.clone(); value_col.rename(name); let tmp_df = value_col.into_frame(); - let mut aggregated = expr.evaluate(&tmp_df, &groups)?; + let mut aggregated = Column::from(expr.evaluate(&tmp_df, &groups)?); aggregated.rename(value_col_name.clone()); aggregated }, @@ -354,7 +361,7 @@ fn pivot_impl_single_column( n_cols, &row_locations, &col_locations, - &value_agg_phys, + value_agg_phys.as_materialized_series(), logical_type, &headers, ) diff --git a/crates/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs index 51761df873b5..9bce6710a1db 100644 --- a/crates/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -16,7 +16,7 @@ pub(super) fn position_aggregates( value_agg_phys: &Series, logical_type: &DataType, headers: &StringChunked, -) -> Vec { +) -> Vec { let mut buf = vec![AnyValue::Null; n_rows * n_cols]; let start_ptr = buf.as_mut_ptr() 
as usize; @@ -93,7 +93,7 @@ pub(super) fn position_aggregates( }, _ => Series::from_any_values_and_dtype(name, avs, &phys_type, false).unwrap(), }; - unsafe { out.cast_unchecked(logical_type).unwrap() } + unsafe { out.cast_unchecked(logical_type).unwrap() }.into() }) .collect::>() }) @@ -107,7 +107,7 @@ pub(super) fn position_aggregates_numeric( value_agg_phys: &ChunkedArray, logical_type: &DataType, headers: &StringChunked, -) -> Vec +) -> Vec where T: PolarsNumericType, ChunkedArray: IntoSeries, @@ -172,7 +172,7 @@ where .map(PlSmallStr::from_str) .unwrap_or_else(|| PlSmallStr::from_static("null")); let out = ChunkedArray::::from_slice_options(name, opt_values).into_series(); - unsafe { out.cast_unchecked(logical_type).unwrap() } + unsafe { out.cast_unchecked(logical_type).unwrap() }.into() }) .collect::>() }) @@ -231,7 +231,7 @@ pub(super) fn compute_col_idx( pivot_df: &DataFrame, column: &str, groups: &GroupsProxy, -) -> PolarsResult<(Vec, Series)> { +) -> PolarsResult<(Vec, Column)> { let column_s = pivot_df.column(column)?; let column_agg = unsafe { column_s.agg_first(groups) }; let column_agg_physical = column_agg.to_physical_repr(); @@ -251,11 +251,19 @@ pub(super) fn compute_col_idx( compute_col_idx_numeric(&ca) }, T::Float64 => { - let ca: &ChunkedArray = column_agg_physical.as_ref().as_ref().as_ref(); + let ca: &ChunkedArray = column_agg_physical + .as_materialized_series() + .as_ref() + .as_ref() + .as_ref(); compute_col_idx_numeric(ca) }, T::Float32 => { - let ca: &ChunkedArray = column_agg_physical.as_ref().as_ref().as_ref(); + let ca: &ChunkedArray = column_agg_physical + .as_materialized_series() + .as_ref() + .as_ref() + .as_ref(); compute_col_idx_numeric(ca) }, T::Struct(_) => { @@ -280,6 +288,7 @@ pub(super) fn compute_col_idx( let mut col_to_idx = PlHashMap::with_capacity(HASHMAP_INIT_SIZE); let mut idx = 0 as IdxSize; column_agg_physical + .as_materialized_series() .phys_iter() .map(|v| { let idx = *col_to_idx.entry(v).or_insert_with(|| { @@ 
-301,7 +310,7 @@ fn compute_row_index<'a, T>( index_agg_physical: &'a ChunkedArray, count: usize, logical_type: &DataType, -) -> (Vec, usize, Option>) +) -> (Vec, usize, Option>) where T: PolarsDataType, T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd, @@ -337,7 +346,7 @@ where .into_series(); s.rename(index[0].clone()); let s = restore_logical_type(&s, logical_type); - Some(vec![s]) + Some(vec![s.into()]) }, _ => None, }; @@ -350,7 +359,7 @@ fn compute_row_index_struct( index_agg: &Series, index_agg_physical: &BinaryOffsetChunked, count: usize, -) -> (Vec, usize, Option>) { +) -> (Vec, usize, Option>) { let mut row_to_idx = PlIndexMap::with_capacity_and_hasher(HASHMAP_INIT_SIZE, Default::default()); let mut idx = 0 as IdxSize; @@ -382,7 +391,7 @@ fn compute_row_index_struct( // 0 and `index_agg.len() - 1`. let mut s = unsafe { index_agg.take_slice_unchecked(&unique_indices) }; s.rename(index[0].clone()); - Some(vec![s]) + Some(vec![s.into()]) }, _ => None, }; @@ -396,7 +405,7 @@ pub(super) fn compute_row_idx( index: &[PlSmallStr], groups: &GroupsProxy, count: usize, -) -> PolarsResult<(Vec, usize, Option>)> { +) -> PolarsResult<(Vec, usize, Option>)> { let (row_locations, n_rows, row_index) = if index.len() == 1 { let index_s = pivot_df.column(&index[0])?; let index_agg = unsafe { index_s.agg_first(groups) }; @@ -417,11 +426,19 @@ pub(super) fn compute_row_idx( compute_row_index(index, &ca, count, index_s.dtype()) }, T::Float64 => { - let ca: &ChunkedArray = index_agg_physical.as_ref().as_ref().as_ref(); + let ca: &ChunkedArray = index_agg_physical + .as_materialized_series() + .as_ref() + .as_ref() + .as_ref(); compute_row_index(index, ca, count, index_s.dtype()) }, T::Float32 => { - let ca: &ChunkedArray = index_agg_physical.as_ref().as_ref().as_ref(); + let ca: &ChunkedArray = index_agg_physical + .as_materialized_series() + .as_ref() + .as_ref() + .as_ref(); compute_row_index(index, ca, count, index_s.dtype()) }, T::Boolean => { @@ -431,7 +448,7 @@ 
pub(super) fn compute_row_idx( T::Struct(_) => { let ca = index_agg_physical.struct_().unwrap(); let ca = ca.get_row_encoded(Default::default())?; - compute_row_index_struct(index, &index_agg, &ca, count) + compute_row_index_struct(index, index_agg.as_materialized_series(), &ca, count) }, T::String => { let ca = index_agg_physical.str().unwrap(); @@ -442,6 +459,7 @@ pub(super) fn compute_row_idx( PlIndexMap::with_capacity_and_hasher(HASHMAP_INIT_SIZE, Default::default()); let mut idx = 0 as IdxSize; let row_locations = index_agg_physical + .as_materialized_series() .phys_iter() .map(|v| { let idx = *row_to_idx.entry(v).or_insert_with(|| { @@ -460,7 +478,7 @@ pub(super) fn compute_row_idx( row_to_idx.into_iter().map(|(k, _)| k).collect::>(), ); let s = restore_logical_type(&s, index_s.dtype()); - Some(vec![s]) + Some(vec![Column::from(s)]) }, _ => None, }; @@ -470,9 +488,14 @@ pub(super) fn compute_row_idx( } } else { let binding = pivot_df.select(index.iter().cloned())?; + // @scalar-opt let fields = binding.get_columns(); + let fields = fields + .iter() + .map(|c| c.as_materialized_series().clone()) + .collect::>(); let index_struct_series = - StructChunked::from_series(PlSmallStr::from_static("placeholder"), fields)? + StructChunked::from_series(PlSmallStr::from_static("placeholder"), &fields)? 
.into_series(); let index_agg = unsafe { index_struct_series.agg_first(groups) }; let index_agg_physical = index_agg.to_physical_repr(); @@ -486,7 +509,8 @@ pub(super) fn compute_row_idx( polars_ensure!(ca.null_count() == 0, InvalidOperation: "outer nullability in struct pivot not yet supported"); - Ok(ca.fields_as_series()) + // @scalar-opt + Ok(ca.fields_as_series().into_iter().map(Column::from).collect()) }).transpose()?; (row_locations, n_rows, row_index) }; diff --git a/crates/polars-ops/src/frame/pivot/unpivot.rs b/crates/polars-ops/src/frame/pivot/unpivot.rs index a9255bdede0e..ea1bd83c2791 100644 --- a/crates/polars-ops/src/frame/pivot/unpivot.rs +++ b/crates/polars-ops/src/frame/pivot/unpivot.rs @@ -1,7 +1,7 @@ use arrow::array::{MutableArray, MutablePlString}; use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked; use polars_core::datatypes::{DataType, PlSmallStr}; -use polars_core::frame::DataFrame; +use polars_core::frame::{Column, DataFrame}; use polars_core::prelude::{IntoVec, Series, UnpivotArgsIR}; use polars_core::utils::try_get_supertype; use polars_error::{polars_err, PolarsResult}; @@ -96,8 +96,8 @@ pub trait UnpivotDF: IntoDf { if self_.get_columns().is_empty() { return DataFrame::new(vec![ - Series::new_empty(variable_name, &DataType::String), - Series::new_empty(value_name, &DataType::Null), + Column::new_empty(variable_name, &DataType::String), + Column::new_empty(value_name, &DataType::Null), ]); } @@ -107,8 +107,8 @@ pub trait UnpivotDF: IntoDf { if on.is_empty() { // return empty frame if there are no columns available to use as value vars if index.len() == self_.width() { - let variable_col = Series::new_empty(variable_name, &DataType::String); - let value_col = Series::new_empty(value_name, &DataType::Null); + let variable_col = Column::new_empty(variable_name, &DataType::String); + let value_col = Column::new_empty(value_name, &DataType::Null); let mut out = self_.select(index).unwrap().clear().take_columns(); 
out.push(variable_col); @@ -167,13 +167,13 @@ pub trait UnpivotDF: IntoDf { let value_col = col.cast(&st).map_err( |_| polars_err!(InvalidOperation: "'unpivot' not supported for dtype: {}", col.dtype()), )?; - values.extend_from_slice(value_col.chunks()) + values.extend_from_slice(value_col.as_materialized_series().chunks()) } let values_arr = concatenate_owned_unchecked(&values)?; // SAFETY: // The give dtype is correct let values = - unsafe { Series::from_chunks_and_dtype_unchecked(value_name, vec![values_arr], &st) }; + unsafe { Series::from_chunks_and_dtype_unchecked(value_name, vec![values_arr], &st) }.into(); let variable_col = variable_col.as_box(); // SAFETY: @@ -184,7 +184,7 @@ pub trait UnpivotDF: IntoDf { vec![variable_col], &DataType::String, ) - }; + }.into(); ids.hstack_mut(&[variables, values])?; diff --git a/crates/polars-ops/src/series/ops/horizontal.rs b/crates/polars-ops/src/series/ops/horizontal.rs index 4412e2aa21d1..53a392f920df 100644 --- a/crates/polars-ops/src/series/ops/horizontal.rs +++ b/crates/polars-ops/src/series/ops/horizontal.rs @@ -1,31 +1,35 @@ use polars_core::frame::NullStrategy; use polars_core::prelude::*; -pub fn max_horizontal(s: &[Series]) -> PolarsResult> { +pub fn max_horizontal(s: &[Column]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.max_horizontal() + .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } -pub fn min_horizontal(s: &[Series]) -> PolarsResult> { +pub fn min_horizontal(s: &[Column]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.min_horizontal() + .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } -pub fn sum_horizontal(s: &[Series]) -> PolarsResult> { +pub fn sum_horizontal(s: &[Column]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.sum_horizontal(NullStrategy::Ignore) + .map(|s| s.map(Column::from)) 
.map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } -pub fn mean_horizontal(s: &[Series]) -> PolarsResult> { +pub fn mean_horizontal(s: &[Column]) -> PolarsResult> { let df = unsafe { DataFrame::new_no_checks(Vec::from(s)) }; df.mean_horizontal(NullStrategy::Ignore) + .map(|s| s.map(Column::from)) .map(|opt_s| opt_s.map(|res| res.with_name(s[0].name().clone()))) } -pub fn coalesce_series(s: &[Series]) -> PolarsResult { +pub fn coalesce_columns(s: &[Column]) -> PolarsResult { // TODO! this can be faster if we have more than two inputs. polars_ensure!(!s.is_empty(), NoData: "cannot coalesce empty list"); let mut out = s[0].clone(); @@ -34,7 +38,10 @@ pub fn coalesce_series(s: &[Series]) -> PolarsResult { return Ok(out); } else { let mask = out.is_not_null(); - out = out.zip_with_same_type(&mask, s)?; + out = out + .as_materialized_series() + .zip_with_same_type(&mask, s.as_materialized_series())? + .into(); } } Ok(out) diff --git a/crates/polars-ops/src/series/ops/replace.rs b/crates/polars-ops/src/series/ops/replace.rs index ff9f8f18760d..9dde258a86aa 100644 --- a/crates/polars-ops/src/series/ops/replace.rs +++ b/crates/polars-ops/src/series/ops/replace.rs @@ -179,7 +179,7 @@ fn replace_by_multiple( }, )?; - let replaced = joined.column("__POLARS_REPLACE_NEW").unwrap(); + let replaced = joined.column("__POLARS_REPLACE_NEW").unwrap().as_materialized_series(); if replaced.null_count() == 0 { return Ok(replaced.clone()); @@ -226,7 +226,7 @@ fn replace_by_multiple_strict(s: &Series, old: Series, new: Series) -> PolarsRes .unwrap(); ensure_all_replaced(mask, s, old_has_null, false)?; - Ok(replaced.clone()) + Ok(replaced.as_materialized_series().clone()) } // Build replacer dataframe. 
@@ -235,11 +235,12 @@ fn create_replacer(mut old: Series, mut new: Series, add_mask: bool) -> PolarsRe new.rename(PlSmallStr::from_static("__POLARS_REPLACE_NEW")); let cols = if add_mask { - let mask = Series::new(PlSmallStr::from_static("__POLARS_REPLACE_MASK"), &[true]) + // @scalar-opt + let mask = Column::new(PlSmallStr::from_static("__POLARS_REPLACE_MASK"), &[true]) .new_from_index(0, new.len()); - vec![old, new, mask] + vec![old.into(), new.into(), mask.into()] } else { - vec![old, new] + vec![old.into(), new.into()] }; let out = unsafe { DataFrame::new_no_checks(cols) }; Ok(out) diff --git a/crates/polars-ops/src/series/ops/to_dummies.rs b/crates/polars-ops/src/series/ops/to_dummies.rs index 3cd9d426ac1d..437f49dad480 100644 --- a/crates/polars-ops/src/series/ops/to_dummies.rs +++ b/crates/polars-ops/src/series/ops/to_dummies.rs @@ -42,7 +42,7 @@ impl ToDummies for Series { dummies_helper_slice(offset, len, self.len(), name) }, }; - ca.into_series() + ca.into_column() }) .collect(); @@ -77,7 +77,7 @@ fn dummies_helper_slice( ChunkedArray::from_vec(name, av) } -fn sort_columns(mut columns: Vec) -> Vec { +fn sort_columns(mut columns: Vec) -> Vec { columns.sort_by(|a, b| a.name().partial_cmp(b.name()).unwrap()); columns } diff --git a/crates/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs index 9ad21ab617d3..c29fcc431c98 100644 --- a/crates/polars-ops/src/series/ops/various.rs +++ b/crates/polars-ops/src/series/ops/various.rs @@ -27,19 +27,19 @@ pub trait SeriesMethods: SeriesSealed { ); // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined let groups = s.group_tuples(parallel, sort)?; - let values = unsafe { s.agg_first(&groups) }; + let values = unsafe { s.agg_first(&groups) }.into(); let counts = groups.group_count().with_name(name.clone()); let counts = if normalize { let len = s.len() as f64; let counts: Float64Chunked = unary_elementwise_values(&counts, |count| count as f64 
/ len); - counts.into_series() + counts.into_column() } else { - counts.into_series() + counts.into_column() }; - let cols = vec![values, counts.into_series()]; + let cols = vec![values, counts]; let df = unsafe { DataFrame::new_no_checks(cols) }; if sort { df.sort( @@ -95,7 +95,7 @@ pub trait SeriesMethods: SeriesSealed { if matches!(s.dtype(), DataType::Struct(_)) { let encoded = _get_rows_encoded_ca( PlSmallStr::EMPTY, - &[s.clone()], + &[s.clone().into()], &[options.descending], &[options.nulls_last], )?; diff --git a/crates/polars-plan/src/dsl/function_expr/fill_null.rs b/crates/polars-plan/src/dsl/function_expr/fill_null.rs index f4d89f203226..0cbe4cb87c9b 100644 --- a/crates/polars-plan/src/dsl/function_expr/fill_null.rs +++ b/crates/polars-plan/src/dsl/function_expr/fill_null.rs @@ -47,5 +47,5 @@ pub(super) fn fill_null(s: &[Series]) -> PolarsResult { } pub(super) fn coalesce(s: &mut [Series]) -> PolarsResult { - coalesce_series(s) + coalesce_columns(s) } From b027465f866c46f883c456717a259c461ad3ba06 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 13:28:39 +0200 Subject: [PATCH 02/42] finish polars-plan --- .../src/chunked_array/struct_/mod.rs | 15 +- crates/polars-core/src/frame/column.rs | 938 ++++++++++++++++++ crates/polars-core/src/frame/mod.rs | 553 +---------- crates/polars-core/src/prelude.rs | 3 +- crates/polars-core/src/series/from.rs | 8 - crates/polars-core/src/utils/mod.rs | 8 +- crates/polars-expr/src/expressions/apply.rs | 8 +- crates/polars-expr/src/planner.rs | 2 +- crates/polars-ffi/src/version_0.rs | 6 +- crates/polars-io/src/csv/read/read_impl.rs | 28 +- crates/polars-io/src/csv/read/reader.rs | 16 +- crates/polars-io/src/csv/write/write_impl.rs | 6 +- crates/polars-io/src/hive.rs | 4 +- crates/polars-io/src/ndjson/core.rs | 2 +- .../polars-io/src/parquet/read/read_impl.rs | 21 +- crates/polars-io/src/shared.rs | 5 +- crates/polars-io/src/utils/other.rs | 1 + .../src/chunked_array/list/namespace.rs | 6 +- 
crates/polars-ops/src/chunked_array/mode.rs | 24 +- .../src/chunked_array/strings/namespace.rs | 12 +- crates/polars-ops/src/chunked_array/top_k.rs | 28 +- crates/polars-ops/src/frame/pivot/unpivot.rs | 9 +- crates/polars-ops/src/series/ops/abs.rs | 7 +- crates/polars-ops/src/series/ops/cut.rs | 22 +- crates/polars-ops/src/series/ops/duration.rs | 14 +- crates/polars-ops/src/series/ops/fused.rs | 44 +- .../ops/interpolation/interpolate_by.rs | 10 +- crates/polars-ops/src/series/ops/not.rs | 4 +- crates/polars-ops/src/series/ops/rle.rs | 18 +- crates/polars-plan/src/dsl/array.rs | 2 +- crates/polars-plan/src/dsl/expr.rs | 2 +- crates/polars-plan/src/dsl/expr_dyn_fn.rs | 36 +- .../polars-plan/src/dsl/function_expr/abs.rs | 2 +- .../src/dsl/function_expr/arg_where.rs | 6 +- .../src/dsl/function_expr/array.rs | 82 +- .../src/dsl/function_expr/binary.rs | 34 +- .../src/dsl/function_expr/boolean.rs | 84 +- .../src/dsl/function_expr/bounds.rs | 46 +- .../src/dsl/function_expr/business.rs | 29 +- .../polars-plan/src/dsl/function_expr/cat.rs | 6 +- .../polars-plan/src/dsl/function_expr/clip.rs | 19 +- .../src/dsl/function_expr/coerce.rs | 4 +- .../src/dsl/function_expr/concat.rs | 2 +- .../src/dsl/function_expr/correlation.rs | 32 +- .../polars-plan/src/dsl/function_expr/cum.rs | 25 +- .../src/dsl/function_expr/datetime.rs | 269 +++-- .../src/dsl/function_expr/dispatch.rs | 163 +-- .../polars-plan/src/dsl/function_expr/ewm.rs | 12 +- .../src/dsl/function_expr/ewm_by.rs | 14 +- .../src/dsl/function_expr/fill_null.rs | 11 +- .../src/dsl/function_expr/fused.rs | 8 +- .../polars-plan/src/dsl/function_expr/list.rs | 155 +-- .../polars-plan/src/dsl/function_expr/log.rs | 20 +- .../polars-plan/src/dsl/function_expr/mod.rs | 38 +- .../polars-plan/src/dsl/function_expr/nan.rs | 6 +- .../src/dsl/function_expr/peaks.rs | 14 +- .../src/dsl/function_expr/plugin.rs | 9 +- .../polars-plan/src/dsl/function_expr/pow.rs | 54 +- .../src/dsl/function_expr/random.rs | 14 +- 
.../src/dsl/function_expr/range/date_range.rs | 10 +- .../dsl/function_expr/range/datetime_range.rs | 20 +- .../src/dsl/function_expr/range/int_range.rs | 10 +- .../src/dsl/function_expr/range/mod.rs | 4 +- .../src/dsl/function_expr/range/time_range.rs | 12 +- .../src/dsl/function_expr/range/utils.rs | 16 +- .../src/dsl/function_expr/rolling.rs | 58 +- .../src/dsl/function_expr/rolling_by.rs | 63 +- .../src/dsl/function_expr/round.rs | 22 +- .../src/dsl/function_expr/row_hash.rs | 8 +- .../src/dsl/function_expr/search_sorted.rs | 10 +- .../src/dsl/function_expr/shift_and_fill.rs | 18 +- .../src/dsl/function_expr/shrink_type.rs | 11 +- .../polars-plan/src/dsl/function_expr/sign.rs | 9 +- .../src/dsl/function_expr/strings.rs | 226 ++--- .../src/dsl/function_expr/struct_.rs | 28 +- .../src/dsl/function_expr/temporal.rs | 12 +- .../src/dsl/function_expr/trigonometry.rs | 120 +-- .../src/dsl/function_expr/unique.rs | 2 +- crates/polars-plan/src/dsl/functions/arity.rs | 10 +- .../src/dsl/functions/horizontal.rs | 66 +- .../polars-plan/src/dsl/functions/repeat.rs | 2 +- crates/polars-plan/src/dsl/list.rs | 2 +- crates/polars-plan/src/dsl/mod.rs | 85 +- crates/polars-plan/src/dsl/name.rs | 2 +- crates/polars-plan/src/dsl/python_udf.rs | 11 +- crates/polars-plan/src/dsl/udf.rs | 6 +- crates/polars-plan/src/plans/aexpr/mod.rs | 2 +- .../polars-plan/src/plans/functions/count.rs | 2 +- .../src/plans/functions/merge_sorted.rs | 14 +- crates/polars-time/src/group_by/dynamic.rs | 88 +- crates/polars-time/src/upsample.rs | 4 +- 91 files changed, 2324 insertions(+), 1649 deletions(-) create mode 100644 crates/polars-core/src/frame/column.rs diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index 95e893bbe73e..e635be7f8f13 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -62,6 +62,15 @@ fn constructor(name: PlSmallStr, fields: &[Series]) 
-> PolarsResult PolarsResult { + // @scalar-opt! + let series = fields + .iter() + .map(|c| c.as_materialized_series().clone()) + .collect::>(); + Self::from_series(name, &series) + } + pub fn from_series(name: PlSmallStr, fields: &[Series]) -> PolarsResult { let mut names = PlHashSet::with_capacity(fields.len()); let first_len = fields.first().map(|s| s.len()).unwrap_or(0); @@ -347,7 +356,11 @@ impl StructChunked { pub fn unnest(self) -> DataFrame { // @scalar-opt - let columns = self.fields_as_series().into_iter().map(Column::from).collect(); + let columns = self + .fields_as_series() + .into_iter() + .map(Column::from) + .collect(); // SAFETY: invariants for struct are the same unsafe { DataFrame::new_no_checks(columns) } diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs new file mode 100644 index 000000000000..865222bbbda9 --- /dev/null +++ b/crates/polars-core/src/frame/column.rs @@ -0,0 +1,938 @@ +use std::borrow::Cow; +use std::ops::{Add, Div, Mul, Rem, Sub}; +use std::sync::OnceLock; + +use num_traits::{Num, NumCast}; +use polars_error::PolarsResult; +use polars_utils::pl_str::PlSmallStr; + +use crate::chunked_array::metadata::MetadataFlags; +use crate::prelude::*; +use crate::series::{BitRepr, IsSorted}; + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +pub enum Column { + Series(Series), + Scalar(ScalarColumn), +} + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +pub struct ScalarColumn { + name: PlSmallStr, + value: AnyValue<'static>, + // invariant: Series.len() == length + #[cfg_attr(feature = "serde", serde(skip))] + materialized: OnceLock, + length: usize, +} + +pub trait IntoColumn: Sized { + fn into_column(self) -> Column; +} + +impl Column { + #[inline] + pub fn new(name: PlSmallStr, values: T) -> Self + where + Phantom: ?Sized, + Series: NamedFrom, + { + Self::Series(NamedFrom::new(name, 
values)) + } + + #[inline] + pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { + // @scalar-opt + Self::Series(Series::new_empty(name, &dtype)) + } + + #[inline] + pub fn new_scalar(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { + Self::Scalar(ScalarColumn::new(name, value, length)) + } + + #[inline] + pub fn as_materialized_series(&self) -> &Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => s.as_materialized_series(), + } + } + + #[inline] + pub fn as_materialized_series_mut(&mut self) -> &mut Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => { + *self = Column::Series(s.to_series()); + let Column::Series(s) = self else { + unreachable!(); + }; + s + }, + } + } + + #[inline] + pub fn dtype(&self) -> &DataType { + // @scalar-opt + self.as_materialized_series().dtype() + } + + #[inline] + pub fn field(&self) -> Cow { + // @scalar-opt + self.as_materialized_series().field() + } + + #[inline] + pub fn as_series(&self) -> Option<&Series> { + match self { + Column::Series(s) => Some(s), + Column::Scalar(_) => None, + } + } + + #[inline] + pub fn as_scalar_column(&self) -> Option<&ScalarColumn> { + match self { + Column::Series(_) => None, + Column::Scalar(s) => Some(s), + } + } + + pub fn i8(&self) -> PolarsResult<&Int8Chunked> { + // @scalar-opt + self.as_materialized_series().i8() + } + + pub fn i16(&self) -> PolarsResult<&Int16Chunked> { + // @scalar-opt + self.as_materialized_series().i16() + } + + pub fn i32(&self) -> PolarsResult<&Int32Chunked> { + // @scalar-opt + self.as_materialized_series().i32() + } + + pub fn i64(&self) -> PolarsResult<&Int64Chunked> { + // @scalar-opt + self.as_materialized_series().i64() + } + + pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { + // @scalar-opt + self.as_materialized_series().u8() + } + + pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { + // @scalar-opt + self.as_materialized_series().u16() + } + + pub fn u32(&self) -> 
PolarsResult<&UInt32Chunked> { + // @scalar-opt + self.as_materialized_series().u32() + } + + pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { + // @scalar-opt + self.as_materialized_series().u64() + } + + pub fn f32(&self) -> PolarsResult<&Float32Chunked> { + // @scalar-opt + self.as_materialized_series().f32() + } + + pub fn f64(&self) -> PolarsResult<&Float64Chunked> { + // @scalar-opt + self.as_materialized_series().f64() + } + + pub fn str(&self) -> PolarsResult<&StringChunked> { + // @scalar-opt + self.as_materialized_series().str() + } + + pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { + // @scalar-opt + self.as_materialized_series().datetime() + } + + #[inline] + pub fn rename(&mut self, name: PlSmallStr) { + match self { + Column::Series(s) => _ = s.rename(name), + Column::Scalar(s) => { + if let Some(series) = s.materialized.get_mut() { + series.rename(name.clone()); + } + + s.name = name; + }, + } + } + + pub fn clear(&self) -> Self { + match self { + Column::Series(s) => s.clear().into(), + Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), + } + } + + #[inline] + pub fn shrink_to_fit(&mut self) { + match self { + Column::Series(s) => s.shrink_to_fit(), + Column::Scalar(_) => {}, + } + } + + #[inline] + pub fn new_from_index(&self, index: usize, length: usize) -> Self { + // @scalar-opt + Self::Series(self.as_materialized_series().new_from_index(index, length)) + } + + #[inline] + pub fn len(&self) -> usize { + match self { + Column::Series(s) => s.len(), + Column::Scalar(s) => s.length, + } + } + + #[inline] + pub fn name(&self) -> &PlSmallStr { + match self { + Column::Series(s) => s.name(), + Column::Scalar(s) => &s.name, + } + } + + pub fn has_nulls(&self) -> bool { + // @scalar-opt + self.as_materialized_series().has_nulls() + } + + pub fn is_not_null(&self) -> ChunkedArray { + // @scalar-opt + self.as_materialized_series().is_not_null() + } + + pub fn to_physical_repr(&self) -> Column { + // @scalar-opt + 
self.as_materialized_series() + .to_physical_repr() + .into_owned() + .into() + } + + pub fn head(&self, length: Option) -> Column { + // @scalar-opt + self.as_materialized_series().head(length).into() + } + + pub fn tail(&self, length: Option) -> Column { + // @scalar-opt + self.as_materialized_series().tail(length).into() + } + + pub fn slice(&self, offset: i64, length: usize) -> Column { + // @scalar-opt + self.as_materialized_series().slice(offset, length).into() + } + + pub fn split_at(&self, offset: i64) -> (Column, Column) { + // @scalar-opt + let (l, r) = self.as_materialized_series().split_at(offset); + (l.into(), r.into()) + } + + pub fn null_count(&self) -> usize { + // @scalar-opt + self.as_materialized_series().null_count() + } + + pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_min(groups) }.into() + } + + pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_max(groups) }.into() + } + + pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_mean(groups) }.into() + } + + pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_sum(groups) }.into() + } + + pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_first(groups) }.into() + } + + pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_last(groups) }.into() + } + + pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() + } + + pub unsafe fn agg_quantile( + &self, + groups: &GroupsProxy, + quantile: f64, + interpol: QuantileInterpolOptions, + ) -> Self { + // @scalar-opt + unsafe { + 
self.as_materialized_series() + .agg_quantile(groups, quantile, interpol) + } + .into() + } + + pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_median(groups) }.into() + } + + pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() + } + + pub unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() + } + + pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_list(groups) }.into() + } + + pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Column { + // @scalar-opt + Series::full_null(name, size, dtype).into() + } + + pub fn is_empty(&self) -> bool { + // @scalar-opt + self.as_materialized_series().is_empty() + } + + pub fn reverse(&self) -> Column { + // @scalar-opt + self.as_materialized_series().reverse().into() + } + + pub fn equals(&self, right: &Column) -> bool { + // @scalar-opt + self.as_materialized_series() + .equals(right.as_materialized_series()) + } + + pub fn equals_missing(&self, right: &Column) -> bool { + // @scalar-opt + self.as_materialized_series() + .equals_missing(right.as_materialized_series()) + } + + pub fn set_sorted_flag(&mut self, sorted: IsSorted) { + // @scalar-opt + match self { + Column::Series(s) => s.set_sorted_flag(sorted), + Column::Scalar(_) => {}, + } + } + + pub fn get_flags(&self) -> MetadataFlags { + match self { + Column::Series(s) => s.get_flags(), + // @scalar-opt + Column::Scalar(_) => MetadataFlags::empty(), + } + } + + pub fn get_data_ptr(&self) -> usize { + // @scalar-opt + self.as_materialized_series().get_data_ptr() + } + + pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { + // @scalar-opt? 
+ self.as_materialized_series().vec_hash(build_hasher, buf) + } + + pub fn vec_hash_combine( + &self, + build_hasher: PlRandomState, + hashes: &mut [u64], + ) -> PolarsResult<()> { + // @scalar-opt? + self.as_materialized_series() + .vec_hash_combine(build_hasher, hashes) + } + + pub(crate) unsafe fn equal_element( + &self, + idx_self: usize, + idx_other: usize, + other: &Column, + ) -> bool { + // @scalar-opt + unsafe { + self.as_materialized_series().equal_element( + idx_self, + idx_other, + other.as_materialized_series(), + ) + } + } + + pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { + self.as_materialized_series().categorical() + } + + pub fn with_name(self, name: PlSmallStr) -> Column { + match self { + Column::Series(s) => s.with_name(name).into(), + Column::Scalar(s) => s.with_name(name).into(), + } + } + + pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { + // @scalar-opt + self.as_materialized_series_mut() + .append(other.as_materialized_series())?; + Ok(self) + } + + pub fn arg_sort(&self, options: SortOptions) -> IdxCa { + // @scalar-opt + self.as_materialized_series().arg_sort(options) + } + + pub fn cast(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().cast(dtype).map(Column::from) + } + + pub fn idx(&self) -> PolarsResult<&IdxCa> { + // @scalar-opt + self.as_materialized_series().idx() + } + + pub fn binary(&self) -> PolarsResult<&BinaryChunked> { + // @scalar-opt + self.as_materialized_series().binary() + } + + pub fn bit_repr(&self) -> Option { + // @scalar-opt + self.as_materialized_series().bit_repr() + } + + pub fn bool(&self) -> PolarsResult<&BooleanChunked> { + // @scalar-opt + self.as_materialized_series().bool() + } + + pub fn struct_(&self) -> PolarsResult<&StructChunked> { + // @scalar-opt + self.as_materialized_series().struct_() + } + + pub fn into_frame(&self) -> DataFrame { + // @scalar-opt + self.as_materialized_series().clone().into_frame() + } + + pub fn 
unique_stable(&self) -> PolarsResult { + // @scalar-opt? + self.as_materialized_series() + .unique_stable() + .map(Column::from) + } + + pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { + // @scalar-opt + self.as_materialized_series_mut() + .extend(other.as_materialized_series())?; + Ok(self) + } + + pub fn rechunk(&self) -> Column { + match self { + Column::Series(s) => s.rechunk().into(), + Column::Scalar(_) => self.clone(), + } + } + + pub fn explode(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().explode().map(Column::from) + } + + pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .fill_null(strategy) + .map(Column::from) + } + + pub fn divide(&self, rhs: &Column) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .divide(rhs.as_materialized_series()) + .map(Column::from) + } + + pub fn shift(&self, periods: i64) -> Column { + // @scalar-opt + self.as_materialized_series().shift(periods).into() + } + + pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .strict_cast(dtype) + .map(Column::from) + } + + pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) + } + + pub fn zip_with_same_type( + &self, + mask: &ChunkedArray, + other: &Column, + ) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .zip_with_same_type(mask, other.as_materialized_series()) + .map(Column::from) + } + + pub fn drop_nulls(&self) -> Column { + // @scalar-opt + self.as_materialized_series().drop_nulls().into() + } + + pub fn is_sorted_flag(&self) -> IsSorted { + // @scalar-opt + self.as_materialized_series().is_sorted_flag() + } + + pub fn get(&self, index: usize) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().get(index) + } + + pub fn 
decimal(&self) -> PolarsResult<&DecimalChunked> { + // @scalar-opt + self.as_materialized_series().decimal() + } + + pub fn unique(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().unique().map(Column::from) + } + + pub fn reshape_list(&self, dimensions: &[i64]) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .reshape_list(dimensions) + .map(Self::from) + } + + pub fn reshape_array(&self, dimensions: &[i64]) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .reshape_array(dimensions) + .map(Self::from) + } + + pub fn sort(&self, sort_options: SortOptions) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .sort(sort_options) + .map(Self::from) + } + + pub fn filter(&self, filter: &ChunkedArray) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().filter(filter).map(Self::from) + } + + pub fn shuffle(&self, seed: Option) -> Self { + // @scalar-opt + self.as_materialized_series().shuffle(seed).into() + } + + pub fn sample_frac( + &self, + frac: f64, + with_replacement: bool, + shuffle: bool, + seed: Option, + ) -> PolarsResult { + self.as_materialized_series() + .sample_frac(frac, with_replacement, shuffle, seed) + .map(Self::from) + } + + pub fn sample_n( + &self, + n: usize, + with_replacement: bool, + shuffle: bool, + seed: Option, + ) -> PolarsResult { + self.as_materialized_series() + .sample_n(n, with_replacement, shuffle, seed) + .map(Self::from) + } + + pub fn gather_every(&self, n: usize, offset: usize) -> Column { + // @scalar-opt + self.as_materialized_series().gather_every(n, offset).into() + } + + pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .extend_constant(value, n) + .map(Self::from) + } + + pub fn array(&self) -> PolarsResult<&ArrayChunked> { + // @scalar-opt + self.as_materialized_series().array() + } + + pub fn list(&self) -> PolarsResult<&ListChunked> { + // @scalar-opt + 
self.as_materialized_series().list() + } + + pub fn is_null(&self) -> BooleanChunked { + // @scalar-opt + self.as_materialized_series().is_null() + } + + pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .zip_with(mask, other.as_materialized_series()) + .map(Self::from) + } + + pub fn is_finite(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_finite() + } + + pub fn is_infinite(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_infinite() + } + + pub fn is_nan(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_nan() + } + + pub fn is_not_nan(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_not_nan() + } + + pub fn date(&self) -> PolarsResult<&DateChunked> { + // @scalar-opt + self.as_materialized_series().date() + } + + pub fn duration(&self) -> PolarsResult<&DurationChunked> { + // @scalar-opt + self.as_materialized_series().duration() + } + + pub fn wrapping_trunc_div_scalar(&self, rhs: T) -> Self + where + T: Num + NumCast, + { + // @scalar-opt + self.as_materialized_series() + .wrapping_trunc_div_scalar(rhs) + .into() + } + + pub fn product(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().product() + } +} + +impl Default for Column { + fn default() -> Self { + // @scalar-opt + Column::Series(Series::default()) + } +} + +impl PartialEq for Column { + fn eq(&self, other: &Self) -> bool { + // @scalar-opt + self.as_materialized_series() + .eq(other.as_materialized_series()) + } +} + +impl From for Column { + #[inline] + fn from(value: Series) -> Self { + Self::Series(value) + } +} + +impl From for Column { + #[inline] + fn from(value: ScalarColumn) -> Self { + Self::Scalar(value) + } +} + +impl Add for Column { + type Output = PolarsResult; + + fn add(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + 
.add(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Add for &Column { + type Output = PolarsResult; + + fn add(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .add(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Sub for Column { + type Output = PolarsResult; + + fn sub(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .sub(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Sub for &Column { + type Output = PolarsResult; + + fn sub(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .sub(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Sub for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn sub(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().sub(rhs).into() + } +} + +impl Sub for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn sub(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().sub(rhs).into() + } +} + +impl Add for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn add(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().add(rhs).into() + } +} + +impl Add for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn add(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().add(rhs).into() + } +} + +impl Div for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn div(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().div(rhs).into() + } +} + +impl Div for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn div(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().div(rhs).into() + } +} + +impl Mul for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn mul(self, rhs: T) -> Self::Output { + // @scalar-opt + 
self.as_materialized_series().mul(rhs).into() + } +} + +impl Mul for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn mul(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().mul(rhs).into() + } +} + +impl Rem for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn rem(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().rem(rhs).into() + } +} + +impl Rem for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn rem(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().rem(rhs).into() + } +} + +impl ScalarColumn { + #[inline] + pub fn new(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { + Self { + name, + value, + materialized: OnceLock::new(), + length, + } + } + + fn _to_series(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Series { + // @TODO: There is probably a better way to do this. + Scalar::new(value.dtype(), value) + .into_series(name) + .new_from_index(0, length) + } + + pub fn to_series(&self) -> Series { + Self::_to_series(self.name.clone(), self.value.clone(), self.length) + } + + pub fn as_materialized_series(&self) -> &Series { + self.materialized.get_or_init(|| self.to_series()) + } + + pub fn select_chunk(&self, _: usize) -> Series { + // @scalar-opt + // @scalar-correctness? + todo!() + } + + fn with_name(self, name: PlSmallStr) -> Self { + // @TODO: Keep materialized somehow? + Self::new(name, self.value, self.length) + } +} + +impl IntoColumn for T { + #[inline] + fn into_column(self) -> Column { + Column::from(self.into_series()) + } +} + +impl IntoColumn for Column { + #[inline(always)] + fn into_column(self) -> Column { + self + } +} diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 1308b4ef1cc5..bf06ebd3e546 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1,13 +1,11 @@ //! DataFrame module. 
#[cfg(feature = "zip_with")] use std::borrow::Cow; -use std::sync::OnceLock; use std::{mem, ops}; use polars_utils::itertools::Itertools; use rayon::prelude::*; -use crate::chunked_array::metadata::MetadataFlags; #[cfg(feature = "algorithm_group_by")] use crate::chunked_array::ops::unique::is_unique_helper; use crate::prelude::*; @@ -18,6 +16,7 @@ use crate::utils::{slice_offsets, try_get_supertype, NoNull}; #[cfg(feature = "dataframe_arithmetic")] mod arithmetic; mod chunks; +pub mod column; pub mod explode; mod from; #[cfg(feature = "algorithm_group_by")] @@ -39,7 +38,7 @@ use crate::hashing::_df_rows_to_hashes_threaded_vertical; #[cfg(feature = "zip_with")] use crate::prelude::min_max_binary::min_max_binary_series; use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort}; -use crate::series::{BitRepr, IsSorted}; +use crate::series::IsSorted; use crate::POOL; #[derive(Copy, Clone, Debug)] @@ -174,546 +173,6 @@ pub struct DataFrame { pub(crate) columns: Vec, } -#[derive(Debug, Clone)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -pub enum Column { - Series(Series), - Scalar(ScalarColumn), -} - -impl PartialEq for Column { - fn eq(&self, other: &Self) -> bool { - // @scalar-opt - self.as_materialized_series() - .eq(other.as_materialized_series()) - } -} - -#[derive(Debug, Clone)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -pub struct ScalarColumn { - name: PlSmallStr, - value: AnyValue<'static>, - // invariant: Series.len() == length - #[cfg_attr(feature = "serde", serde(skip))] - materialized: OnceLock, - length: usize, -} - -pub trait IntoColumn: Sized { - fn into_column(self) -> Column; -} - -impl IntoColumn for T { - fn into_column(self) -> Column { - IntoSeries::into_column(self) - } -} - -impl IntoColumn for Column { - fn into_column(self) -> Column { - self - } -} - -impl ScalarColumn { - #[inline] - pub fn new(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self 
{ - Self { - name, - value, - materialized: OnceLock::new(), - length, - } - } - - fn _to_series(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Series { - // @TODO: There is probably a better way to do this. - Scalar::new(value.dtype(), value) - .into_series(name) - .new_from_index(0, length) - } - - pub fn to_series(&self) -> Series { - Self::_to_series(self.name.clone(), self.value.clone(), self.length) - } - - pub fn as_materialized_series(&self) -> &Series { - self.materialized.get_or_init(|| self.to_series()) - } - - pub fn select_chunk(&self, _: usize) -> Series { - // @scalar-opt - // @scalar-correctness? - todo!() - } - - fn with_name(self, name: PlSmallStr) -> Self { - // @TODO: Keep materialized somehow? - Self::new(name, self.value, self.length) - } -} - -impl Column { - #[inline] - pub fn new(name: PlSmallStr, values: T) -> Self - where - Phantom: ?Sized, - Series: NamedFrom, - { - Self::Series(NamedFrom::new(name, values)) - } - - #[inline] - pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { - // @scalar-opt - Self::Series(Series::new_empty(name, &dtype)) - } - - #[inline] - pub fn new_scalar(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { - Self::Scalar(ScalarColumn::new(name, value, length)) - } - - #[inline] - pub fn as_materialized_series(&self) -> &Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => s.as_materialized_series(), - } - } - - #[inline] - pub fn as_materialized_series_mut(&mut self) -> &mut Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => { - *self = Column::Series(s.to_series()); - let Column::Series(s) = self else { - unreachable!(); - }; - s - }, - } - } - - #[inline] - pub fn dtype(&self) -> &DataType { - // @scalar-opt - self.as_materialized_series().dtype() - } - - #[inline] - pub fn field(&self) -> Cow { - // @scalar-opt - self.as_materialized_series().field() - } - - #[inline] - pub fn as_series(&self) -> Option<&Series> { - match self 
{ - Column::Series(s) => Some(s), - Column::Scalar(_) => None, - } - } - - #[inline] - pub fn as_scalar_column(&self) -> Option<&ScalarColumn> { - match self { - Column::Series(_) => None, - Column::Scalar(s) => Some(s), - } - } - - pub fn i8(&self) -> PolarsResult<&Int8Chunked> { - // @scalar-opt - self.as_materialized_series().i8() - } - - pub fn i16(&self) -> PolarsResult<&Int16Chunked> { - // @scalar-opt - self.as_materialized_series().i16() - } - - pub fn i32(&self) -> PolarsResult<&Int32Chunked> { - // @scalar-opt - self.as_materialized_series().i32() - } - - pub fn i64(&self) -> PolarsResult<&Int64Chunked> { - // @scalar-opt - self.as_materialized_series().i64() - } - - pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { - // @scalar-opt - self.as_materialized_series().u8() - } - - pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { - // @scalar-opt - self.as_materialized_series().u16() - } - - pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { - // @scalar-opt - self.as_materialized_series().u32() - } - - pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { - // @scalar-opt - self.as_materialized_series().u64() - } - - pub fn f32(&self) -> PolarsResult<&Float32Chunked> { - // @scalar-opt - self.as_materialized_series().f32() - } - - pub fn f64(&self) -> PolarsResult<&Float64Chunked> { - // @scalar-opt - self.as_materialized_series().f64() - } - - pub fn str(&self) -> PolarsResult<&StringChunked> { - // @scalar-opt - self.as_materialized_series().str() - } - - #[inline] - pub fn rename(&mut self, name: PlSmallStr) { - match self { - Column::Series(s) => _ = s.rename(name), - Column::Scalar(s) => { - if let Some(series) = s.materialized.get_mut() { - series.rename(name.clone()); - } - - s.name = name; - }, - } - } - - pub fn clear(&self) -> Self { - match self { - Column::Series(s) => s.clear().into(), - Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), - } - } - - #[inline] - pub fn shrink_to_fit(&mut self) { - match self { - 
Column::Series(s) => s.shrink_to_fit(), - Column::Scalar(_) => {}, - } - } - - #[inline] - pub fn new_from_index(&self, index: usize, length: usize) -> Self { - // @scalar-opt - Self::Series(self.as_materialized_series().new_from_index(index, length)) - } - - #[inline] - pub fn len(&self) -> usize { - match self { - Column::Series(s) => s.len(), - Column::Scalar(s) => s.length, - } - } - - #[inline] - pub fn name(&self) -> &PlSmallStr { - match self { - Column::Series(s) => s.name(), - Column::Scalar(s) => &s.name, - } - } - - pub fn has_nulls(&self) -> bool { - // @scalar-opt - self.as_materialized_series().has_nulls() - } - - pub fn is_not_null(&self) -> ChunkedArray { - // @scalar-opt - self.as_materialized_series().is_not_null() - } - - pub fn to_physical_repr(&self) -> Column { - // @scalar-opt - self.as_materialized_series() - .to_physical_repr() - .into_owned() - .into() - } - - pub fn head(&self, length: Option) -> Column { - // @scalar-opt - self.as_materialized_series().head(length).into() - } - - pub fn tail(&self, length: Option) -> Column { - // @scalar-opt - self.as_materialized_series().tail(length).into() - } - - pub fn slice(&self, offset: i64, length: usize) -> Column { - // @scalar-opt - self.as_materialized_series().slice(offset, length).into() - } - - pub fn split_at(&self, offset: i64) -> (Column, Column) { - // @scalar-opt - let (l, r) = self.as_materialized_series().split_at(offset); - (l.into(), r.into()) - } - - pub fn null_count(&self) -> usize { - // @scalar-opt - self.as_materialized_series().null_count() - } - - pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_min(groups) }.into() - } - - pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_max(groups) }.into() - } - - pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { 
self.as_materialized_series().agg_mean(groups) }.into() - } - - pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_sum(groups) }.into() - } - - pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_first(groups) }.into() - } - - pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_last(groups) }.into() - } - - pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() - } - - pub unsafe fn agg_quantile( - &self, - groups: &GroupsProxy, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> Self { - // @scalar-opt - unsafe { - self.as_materialized_series() - .agg_quantile(groups, quantile, interpol) - } - .into() - } - - pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_median(groups) }.into() - } - - pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() - } - - pub unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() - } - - pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_list(groups) }.into() - } - - pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Column { - // @scalar-opt - Series::full_null(name, size, dtype).into() - } - - pub fn is_empty(&self) -> bool { - // @scalar-opt - self.as_materialized_series().is_empty() - } - - pub fn reverse(&self) -> Column { - // @scalar-opt - self.as_materialized_series().reverse().into() - } - - pub fn equals(&self, right: &Column) -> bool { - 
// @scalar-opt - self.as_materialized_series() - .equals(right.as_materialized_series()) - } - - pub fn equals_missing(&self, right: &Column) -> bool { - // @scalar-opt - self.as_materialized_series() - .equals_missing(right.as_materialized_series()) - } - - pub fn set_sorted_flag(&mut self, sorted: IsSorted) { - // @scalar-opt - match self { - Column::Series(s) => s.set_sorted_flag(sorted), - Column::Scalar(_) => {}, - } - } - - pub fn get_flags(&self) -> MetadataFlags { - match self { - Column::Series(s) => s.get_flags(), - // @scalar-opt - Column::Scalar(_) => MetadataFlags::empty(), - } - } - - pub fn get_data_ptr(&self) -> usize { - // @scalar-opt - self.as_materialized_series().get_data_ptr() - } - - pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { - // @scalar-opt? - self.as_materialized_series().vec_hash(build_hasher, buf) - } - - pub fn vec_hash_combine( - &self, - build_hasher: PlRandomState, - hashes: &mut [u64], - ) -> PolarsResult<()> { - // @scalar-opt? 
- self.as_materialized_series() - .vec_hash_combine(build_hasher, hashes) - } - - pub(crate) unsafe fn equal_element( - &self, - idx_self: usize, - idx_other: usize, - other: &Column, - ) -> bool { - // @scalar-opt - unsafe { - self.as_materialized_series().equal_element( - idx_self, - idx_other, - other.as_materialized_series(), - ) - } - } - - pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { - self.as_materialized_series().categorical() - } - - pub fn with_name(self, name: PlSmallStr) -> Column { - match self { - Column::Series(s) => s.with_name(name).into(), - Column::Scalar(s) => s.with_name(name).into(), - } - } - - pub(crate) fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { - // @scalar-opt - self.as_materialized_series_mut() - .append(other.as_materialized_series())?; - Ok(self) - } - - pub fn arg_sort(&self, options: SortOptions) -> IdxCa { - // @scalar-opt - self.as_materialized_series().arg_sort(options) - } - - pub fn cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().cast(dtype).map(Column::from) - } - - pub fn idx(&self) -> PolarsResult<&IdxCa> { - // @scalar-opt - self.as_materialized_series().idx() - } - - pub fn binary(&self) -> PolarsResult<&BinaryChunked> { - // @scalar-opt - self.as_materialized_series().binary() - } - - pub fn bit_repr(&self) -> Option { - // @scalar-opt - self.as_materialized_series().bit_repr() - } - - pub fn bool(&self) -> PolarsResult<&BooleanChunked> { - // @scalar-opt - self.as_materialized_series().bool() - } - - pub fn struct_(&self) -> PolarsResult<&StructChunked> { - // @scalar-opt - self.as_materialized_series().struct_() - } - - pub fn into_frame(&self) -> DataFrame { - // @scalar-opt - self.as_materialized_series().clone().into_frame() - } - - pub fn unique_stable(&self) -> PolarsResult { - // @scalar-opt? 
- self.as_materialized_series().unique_stable().map(Column::from) - } - - pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { - // @scalar-opt - self.as_materialized_series_mut().extend(other.as_materialized_series())?; - Ok(self) - } - - pub fn rechunk(&self) -> Column { - match self { - Column::Series(s) => s.rechunk().into(), - Column::Scalar(_) => self.clone(), - } - } -} - -impl From for Column { - #[inline] - fn from(value: Series) -> Self { - Self::Series(value) - } -} - -impl From for Column { - #[inline] - fn from(value: ScalarColumn) -> Self { - Self::Scalar(value) - } -} - impl DataFrame { pub fn materialized_column_iter(&self) -> impl Iterator { self.columns.iter().map(Column::as_materialized_series) @@ -1515,7 +974,7 @@ impl DataFrame { let left = left.as_materialized_series_mut(); let right = right.as_materialized_series(); - ensure_can_extend(left, right)?; + ensure_can_extend(&*left, right)?; left.append(right)?; Ok(()) })?; @@ -1569,7 +1028,7 @@ impl DataFrame { let left = left.as_materialized_series_mut(); let right = right.as_materialized_series(); - ensure_can_extend(left, right)?; + ensure_can_extend(&*left, right)?; left.extend(right)?; Ok(()) }) @@ -3062,8 +2521,8 @@ impl DataFrame { 0 => Ok(None), 1 => Ok(Some(self.columns[0].as_materialized_series().clone())), 2 => max_fn( - &self.columns[0].as_materialized_series(), - &self.columns[1].as_materialized_series(), + self.columns[0].as_materialized_series(), + self.columns[1].as_materialized_series(), ) .map(Some), _ => { diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index f885ebac5d18..2d729261c287 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -45,7 +45,8 @@ pub use crate::frame::explode::UnpivotArgsIR; pub(crate) use crate::frame::group_by::aggregations::*; #[cfg(feature = "algorithm_group_by")] pub use crate::frame::group_by::*; -pub use crate::frame::{DataFrame, Column, UniqueKeepStrategy}; 
+pub use crate::frame::{DataFrame, UniqueKeepStrategy}; +pub use crate::frame::column::{Column, IntoColumn}; pub use crate::hashing::VecHash; pub use crate::named_from::{NamedFrom, NamedFromOwned}; pub use crate::scalar::Scalar; diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index fa2019f0f000..ce473a4d60fb 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -732,14 +732,6 @@ pub unsafe trait IntoSeries { fn into_series(self) -> Series where Self: Sized; - - #[inline(always)] - fn into_column(self) -> Column - where - Self: Sized - { - Column::from(self.into_series()) - } } impl From> for Series diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index ebe2faa17918..fc5c77a65ad6 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -1137,10 +1137,10 @@ pub fn coalesce_nulls<'a, T: PolarsDataType>( } } -pub fn coalesce_nulls_series(a: &Series, b: &Series) -> (Series, Series) { +pub fn coalesce_nulls_columns(a: &Column, b: &Column) -> (Column, Column) { if a.null_count() > 0 || b.null_count() > 0 { - let mut a = a.rechunk(); - let mut b = b.rechunk(); + let mut a = a.as_materialized_series().rechunk(); + let mut b = b.as_materialized_series().rechunk(); for (arr_a, arr_b) in unsafe { a.chunks_mut().iter_mut().zip(b.chunks_mut()) } { let validity = match (arr_a.validity(), arr_b.validity()) { (None, Some(b)) => Some(b.clone()), @@ -1153,7 +1153,7 @@ pub fn coalesce_nulls_series(a: &Series, b: &Series) -> (Series, Series) { } a.compute_len(); b.compute_len(); - (a, b) + (a.into(), b.into()) } else { (a.clone(), b.clone()) } diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index a5ea16d0f22f..6b71b5df4121 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -15,7 +15,7 @@ use crate::expressions::{ 
pub struct ApplyExpr { inputs: Vec>, - function: SpecialEq>, + function: SpecialEq>, expr: Expr, collect_groups: ApplyOptions, function_returns_scalar: bool, @@ -33,7 +33,7 @@ impl ApplyExpr { #[allow(clippy::too_many_arguments)] pub(crate) fn new( inputs: Vec>, - function: SpecialEq>, + function: SpecialEq>, expr: Expr, options: FunctionOptions, allow_threading: bool, @@ -67,7 +67,7 @@ impl ApplyExpr { pub(crate) fn new_minimal( inputs: Vec>, - function: SpecialEq>, + function: SpecialEq>, expr: Expr, collect_groups: ApplyOptions, ) -> Self { @@ -438,7 +438,7 @@ impl PhysicalExpr for ApplyExpr { fn apply_multiple_elementwise<'a>( mut acs: Vec>, - function: &dyn SeriesUdf, + function: &dyn ColumnsUdf, expr: &Expr, check_lengths: bool, ) -> PolarsResult> { diff --git a/crates/polars-expr/src/planner.rs b/crates/polars-expr/src/planner.rs index e578b8da9679..18cbc222569a 100644 --- a/crates/polars-expr/src/planner.rs +++ b/crates/polars-expr/src/planner.rs @@ -568,7 +568,7 @@ fn create_physical_expr_inner( let input = create_physical_expr_inner(*expr, ctxt, expr_arena, schema, state)?; let function = SpecialEq::new(Arc::new(move |s: &mut [Series]| s[0].explode().map(Some)) - as Arc); + as Arc); Ok(Arc::new(ApplyExpr::new_minimal( vec![input], function, diff --git a/crates/polars-ffi/src/version_0.rs b/crates/polars-ffi/src/version_0.rs index 0fc29055f66d..3cffd4425045 100644 --- a/crates/polars-ffi/src/version_0.rs +++ b/crates/polars-ffi/src/version_0.rs @@ -1,4 +1,4 @@ -use polars_core::prelude::CompatLevel; +use polars_core::prelude::{Column, CompatLevel}; use super::*; @@ -53,6 +53,10 @@ unsafe extern "C" fn c_release_series_export(e: *mut SeriesExport) { e.release = None; } +pub fn export_column(c: &Column) -> SeriesExport { + export_series(c.as_materialized_series()) +} + pub fn export_series(s: &Series) -> SeriesExport { let field = ArrowField::new( s.name().clone(), diff --git a/crates/polars-io/src/csv/read/read_impl.rs 
b/crates/polars-io/src/csv/read/read_impl.rs index 048df2dc2fb2..c7d449b77ccb 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -34,22 +34,22 @@ pub(crate) fn cast_columns( parallel: bool, ignore_errors: bool, ) -> PolarsResult<()> { - let cast_fn = |s: &Series, fld: &Field| { - let out = match (s.dtype(), fld.dtype()) { + let cast_fn = |c: &Column, fld: &Field| { + let out = match (c.dtype(), fld.dtype()) { #[cfg(feature = "temporal")] - (DataType::String, DataType::Date) => s + (DataType::String, DataType::Date) => c .str() .unwrap() .as_date(None, false) - .map(|ca| ca.into_series()), + .map(|ca| ca.into_column()), #[cfg(feature = "temporal")] - (DataType::String, DataType::Time) => s + (DataType::String, DataType::Time) => c .str() .unwrap() .as_time(None, false) - .map(|ca| ca.into_series()), + .map(|ca| ca.into_column()), #[cfg(feature = "temporal")] - (DataType::String, DataType::Datetime(tu, _)) => s + (DataType::String, DataType::Datetime(tu, _)) => c .str() .unwrap() .as_datetime( @@ -60,11 +60,11 @@ pub(crate) fn cast_columns( None, &StringChunked::from_iter(std::iter::once("raise")), ) - .map(|ca| ca.into_series()), - (_, dt) => s.cast(dt), + .map(|ca| ca.into_column()), + (_, dt) => c.cast(dt), }?; - if !ignore_errors && s.null_count() != out.null_count() { - handle_casting_failures(s, &out)?; + if !ignore_errors && c.null_count() != out.null_count() { + handle_casting_failures(c.as_materialized_series(), &out.as_materialized_series())?; } Ok(out) }; @@ -554,7 +554,7 @@ impl<'a> CoreReader<'a> { let columns = buffers .into_iter() - .map(|buf| buf.into_series()) + .map(|buf| buf.into_series().map(Column::from)) .collect::>()?; let mut local_df = unsafe { DataFrame::new_no_checks(columns) }; let current_row_count = local_df.height() as IdxSize; @@ -659,7 +659,7 @@ impl<'a> CoreReader<'a> { let columns = buffers .into_iter() - .map(|buf| buf.into_series()) + .map(|buf| 
buf.into_series().map(Column::from)) .collect::>()?; unsafe { DataFrame::new_no_checks(columns) } }; @@ -766,7 +766,7 @@ fn read_chunk( let columns = buffers .into_iter() - .map(|buf| buf.into_series()) + .map(|buf| buf.into_series().map(Column::from)) .collect::>()?; Ok(unsafe { DataFrame::new_no_checks(columns) }) } diff --git a/crates/polars-io/src/csv/read/reader.rs b/crates/polars-io/src/csv/read/reader.rs index 49fb576fff8a..857b223bae0b 100644 --- a/crates/polars-io/src/csv/read/reader.rs +++ b/crates/polars-io/src/csv/read/reader.rs @@ -325,22 +325,22 @@ fn parse_dates(mut df: DataFrame, fixed_schema: &Schema) -> DataFrame { let cols = unsafe { std::mem::take(df.get_columns_mut()) } .into_par_iter() - .map(|s| { - match s.dtype() { + .map(|c| { + match c.dtype() { DataType::String => { - let ca = s.str().unwrap(); + let ca = c.str().unwrap(); // don't change columns that are in the fixed schema. - if fixed_schema.index_of(s.name()).is_some() { - return s; + if fixed_schema.index_of(c.name()).is_some() { + return c; } #[cfg(feature = "dtype-time")] if let Ok(ca) = ca.as_time(None, false) { - return ca.into_series(); + return ca.into_column(); } - s + c }, - _ => s, + _ => c, } }); let cols = POOL.install(|| cols.collect::>()); diff --git a/crates/polars-io/src/csv/write/write_impl.rs b/crates/polars-io/src/csv/write/write_impl.rs index a3f72b56161f..faeb8d0e449d 100644 --- a/crates/polars-io/src/csv/write/write_impl.rs +++ b/crates/polars-io/src/csv/write/write_impl.rs @@ -140,7 +140,7 @@ pub(crate) fn write( // the bck thinks the lifetime is bounded to write_buffer_pool, but at the time we return // the vectors the buffer pool, the series have already been removed from the buffers // in other words, the lifetime does not leave this scope - let cols = unsafe { std::mem::transmute::<&[Series], &[Series]>(cols) }; + let cols = unsafe { std::mem::transmute::<&[Column], &[Column]>(cols) }; let mut write_buffer = write_buffer_pool.get(); if df.is_empty() { @@ 
-154,7 +154,7 @@ pub(crate) fn write( .enumerate() .map(|(i, col)| { serializer_for( - &*col.chunks()[0], + &*col.as_materialized_series().chunks()[0], options, col.dtype(), datetime_formats[i], @@ -165,7 +165,7 @@ pub(crate) fn write( } else { debug_assert_eq!(serializers_vec.len(), cols.len()); for (col_iter, col) in std::iter::zip(&mut serializers_vec, cols) { - col_iter.update_array(&*col.chunks()[0]); + col_iter.update_array(&*col.as_materialized_series().chunks()[0]); } } diff --git a/crates/polars-io/src/hive.rs b/crates/polars-io/src/hive.rs index 17ace26d6be7..77e65647fa56 100644 --- a/crates/polars-io/src/hive.rs +++ b/crates/polars-io/src/hive.rs @@ -22,7 +22,9 @@ pub(crate) fn materialize_hive_partitions( return; } - let hive_columns_iter = hive_columns.iter().map(|s| s.new_from_index(0, num_rows)); + let hive_columns_iter = hive_columns + .iter() + .map(|s| s.new_from_index(0, num_rows).into()); if reader_schema.index_of(hive_columns[0].name()).is_none() || df.width() == 0 { // Fast-path - all hive columns are at the end diff --git a/crates/polars-io/src/ndjson/core.rs b/crates/polars-io/src/ndjson/core.rs index c3754f9403d1..a72b4ccf7038 100644 --- a/crates/polars-io/src/ndjson/core.rs +++ b/crates/polars-io/src/ndjson/core.rs @@ -309,7 +309,7 @@ impl<'a> CoreJsonReader<'a> { let mut local_df = DataFrame::new( buffers .into_values() - .map(|buf| buf.into_series()) + .map(|buf| buf.into_series().into_column()) .collect::<_>(), )?; diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 63fb51464038..fc528267e6f2 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -156,12 +156,14 @@ fn rg_to_dfs( if let Some(row_index) = row_index { let placeholder = NullChunkedBuilder::new(PlSmallStr::from_static("__PL_TMP"), slice.1).finish(); - return Ok(vec![DataFrame::new(vec![placeholder.into_series()])? 
- .with_row_index( - row_index.name.clone(), - Some(row_index.offset + IdxSize::try_from(slice.0).unwrap()), - )? - .select(std::iter::once(row_index.name))?]); + return Ok(vec![DataFrame::new(vec![placeholder + .into_series() + .into_column()])? + .with_row_index( + row_index.name.clone(), + Some(row_index.offset + IdxSize::try_from(slice.0).unwrap()), + )? + .select(std::iter::once(row_index.name))?]); } } @@ -322,6 +324,7 @@ fn rg_to_dfs_prefiltered( .collect::>(); column_idx_to_series(col_idx, field_md.as_slice(), None, schema, store) + .map(Column::from) }) .collect::>>()?; @@ -515,7 +518,7 @@ fn rg_to_dfs_optionally_par_over_columns( Some(Filter::new_ranged(rg_slice.0, rg_slice.0 + rg_slice.1)), schema, store, - ) + ).map(Column::from) }) .collect::>>() })? @@ -532,7 +535,7 @@ fn rg_to_dfs_optionally_par_over_columns( Some(Filter::new_ranged(rg_slice.0, rg_slice.0 + rg_slice.1)), schema, store, - ) + ).map(Column::from) }) .collect::>>()? }; @@ -632,7 +635,7 @@ fn rg_to_dfs_par_over_rg( Some(Filter::new_ranged(slice.0, slice.0 + slice.1)), schema, store, - ) + ).map(Column::from) }) .collect::>>()?; diff --git a/crates/polars-io/src/shared.rs b/crates/polars-io/src/shared.rs index 7fbb5eb96e7f..4babd4f65bd5 100644 --- a/crates/polars-io/src/shared.rs +++ b/crates/polars-io/src/shared.rs @@ -98,7 +98,10 @@ pub(crate) fn finish_reader( // Create an empty dataframe with the correct data types let empty_cols = arrow_schema .iter_values() - .map(|fld| Series::try_from((fld.name.clone(), new_empty_array(fld.dtype.clone())))) + .map(|fld| { + Series::try_from((fld.name.clone(), new_empty_array(fld.dtype.clone()))) + .map(Column::from) + }) .collect::>()?; DataFrame::new(empty_cols)? 
} else { diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 8999ecb657d4..12e3ee2f9d01 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -203,6 +203,7 @@ pub(crate) fn chunk_df_for_writing( // See: #16403 if !df.get_columns().is_empty() && df.get_columns()[0] + .as_materialized_series() .chunk_lengths() .take(5) .all(|len| len < row_group_size) diff --git a/crates/polars-ops/src/chunked_array/list/namespace.rs b/crates/polars-ops/src/chunked_array/list/namespace.rs index 0c7a0975488c..e16ac5da4453 100644 --- a/crates/polars-ops/src/chunked_array/list/namespace.rs +++ b/crates/polars-ops/src/chunked_array/list/namespace.rs @@ -31,7 +31,7 @@ pub(super) fn has_inner_nulls(ca: &ListChunked) -> bool { } fn cast_rhs( - other: &mut [Series], + other: &mut [Column], inner_type: &DataType, dtype: &DataType, length: usize, @@ -294,7 +294,7 @@ pub trait ListNameSpaceImpl: AsList { ca.try_apply_amortized(|s| diff(s.as_ref(), n, null_behavior)) } - fn lst_shift(&self, periods: &Series) -> PolarsResult { + fn lst_shift(&self, periods: &Column) -> PolarsResult { let ca = self.as_list(); let periods_s = periods.cast(&DataType::Int64)?; let periods = periods_s.i64()?; @@ -584,7 +584,7 @@ pub trait ListNameSpaceImpl: AsList { out.map(|ok| self.same_type(ok)) } - fn lst_concat(&self, other: &[Series]) -> PolarsResult { + fn lst_concat(&self, other: &[Column]) -> PolarsResult { let ca = self.as_list(); let other_len = other.len(); let length = ca.len(); diff --git a/crates/polars-ops/src/chunked_array/mode.rs b/crates/polars-ops/src/chunked_array/mode.rs index a36b161775ca..3c2c3025506f 100644 --- a/crates/polars-ops/src/chunked_array/mode.rs +++ b/crates/polars-ops/src/chunked_array/mode.rs @@ -19,13 +19,15 @@ where } fn mode_f32(ca: &Float32Chunked) -> PolarsResult { - let s = ca.apply_as_ints(|v| mode(v).unwrap()); + // @scalar-opt + let s = ca.apply_as_ints(|v| 
mode(&v.clone().into()).unwrap().as_materialized_series().clone()); let ca = s.f32().unwrap().clone(); Ok(ca) } fn mode_64(ca: &Float64Chunked) -> PolarsResult { - let s = ca.apply_as_ints(|v| mode(v).unwrap()); + // @scalar-opt + let s = ca.apply_as_ints(|v| mode(&v.clone().into()).unwrap().as_materialized_series().clone()); let ca = s.f64().unwrap().clone(); Ok(ca) } @@ -61,18 +63,18 @@ fn mode_indices(groups: GroupsProxy) -> Vec { } } -pub fn mode(s: &Series) -> PolarsResult { +pub fn mode(s: &Column) -> PolarsResult { let s_phys = s.to_physical_repr(); let out = match s_phys.dtype() { - DataType::Binary => mode_primitive(s_phys.binary().unwrap())?.into_series(), - DataType::Boolean => mode_primitive(s_phys.bool().unwrap())?.into_series(), - DataType::Float32 => mode_f32(s_phys.f32().unwrap())?.into_series(), - DataType::Float64 => mode_64(s_phys.f64().unwrap())?.into_series(), - DataType::String => mode_primitive(&s_phys.str().unwrap().as_binary())?.into_series(), + DataType::Binary => mode_primitive(s_phys.binary().unwrap())?.into_column(), + DataType::Boolean => mode_primitive(s_phys.bool().unwrap())?.into_column(), + DataType::Float32 => mode_f32(s_phys.f32().unwrap())?.into_column(), + DataType::Float64 => mode_64(s_phys.f64().unwrap())?.into_column(), + DataType::String => mode_primitive(&s_phys.str().unwrap().as_binary())?.into_column(), dt if dt.is_integer() => { with_match_physical_integer_polars_type!(dt, |$T| { - let ca: &ChunkedArray<$T> = s_phys.as_ref().as_ref().as_ref(); - mode_primitive(ca)?.into_series() + let ca: &ChunkedArray<$T> = s_phys.as_materialized_series().as_ref().as_ref().as_ref(); + mode_primitive(ca)?.into_column() }) }, _ => polars_bail!(opq = mode, s.dtype()), @@ -120,7 +122,7 @@ mod test { ca_builder.append_value("test2"); ca_builder.append_value("test2"); ca_builder.append_value("test2"); - let s = ca_builder.finish().into_series(); + let s = ca_builder.finish().into_column(); let result = mode(&s).unwrap(); 
assert_eq!(result.str_value(0).unwrap(), "test2"); assert_eq!(result.len(), 1); diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 1f2899764e4f..07c8fc600fbd 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -418,7 +418,7 @@ pub trait StringNameSpaceImpl: AsString { Ok(builder.finish()) } - fn strip_chars(&self, pat: &Series) -> PolarsResult { + fn strip_chars(&self, pat: &Column) -> PolarsResult { let ca = self.as_string(); if pat.dtype() == &DataType::Null { Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim()))) @@ -427,7 +427,7 @@ pub trait StringNameSpaceImpl: AsString { } } - fn strip_chars_start(&self, pat: &Series) -> PolarsResult { + fn strip_chars_start(&self, pat: &Column) -> PolarsResult { let ca = self.as_string(); if pat.dtype() == &DataType::Null { return Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim_start()))); @@ -436,7 +436,7 @@ pub trait StringNameSpaceImpl: AsString { } } - fn strip_chars_end(&self, pat: &Series) -> PolarsResult { + fn strip_chars_end(&self, pat: &Column) -> PolarsResult { let ca = self.as_string(); if pat.dtype() == &DataType::Null { return Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim_end()))); @@ -609,7 +609,7 @@ pub trait StringNameSpaceImpl: AsString { /// /// Determines a substring starting from `offset` and with length `length` of each of the elements in `array`. /// `offset` can be negative, in which case the start counts from the end of the string. - fn str_slice(&self, offset: &Series, length: &Series) -> PolarsResult { + fn str_slice(&self, offset: &Column, length: &Column) -> PolarsResult { let ca = self.as_string(); let offset = offset.cast(&DataType::Int64)?; // We strict cast, otherwise negative value will be treated as a valid length. 
@@ -623,7 +623,7 @@ pub trait StringNameSpaceImpl: AsString { /// Determines a substring starting at the beginning of the string up to offset `n` of each /// element in `array`. `n` can be negative, in which case the slice ends `n` characters from /// the end of the string. - fn str_head(&self, n: &Series) -> PolarsResult { + fn str_head(&self, n: &Column) -> PolarsResult { let ca = self.as_string(); let n = n.strict_cast(&DataType::Int64)?; @@ -634,7 +634,7 @@ pub trait StringNameSpaceImpl: AsString { /// /// Determines a substring starting at offset `n` of each element in `array`. `n` can be /// negative, in which case the slice begins `n` characters from the start of the string. - fn str_tail(&self, n: &Series) -> PolarsResult { + fn str_tail(&self, n: &Column) -> PolarsResult { let ca = self.as_string(); let n = n.strict_cast(&DataType::Int64)?; diff --git a/crates/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs index 9487ed17b4c7..c3bcd391e5af 100644 --- a/crates/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -145,8 +145,8 @@ fn top_k_binary_impl( ChunkedArray::with_chunk_like(ca, arr) } -pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { - fn extract_target_and_k(s: &[Series]) -> PolarsResult<(usize, &Series)> { +pub fn top_k(s: &[Column], descending: bool) -> PolarsResult { + fn extract_target_and_k(s: &[Column]) -> PolarsResult<(usize, &Column)> { let k_s = &s[1]; polars_ensure!( k_s.len() == 1, @@ -197,20 +197,20 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { let s = src.to_physical_repr(); match s.dtype() { - DataType::Boolean => Ok(top_k_bool_impl(s.bool().unwrap(), k, descending).into_series()), + DataType::Boolean => Ok(top_k_bool_impl(s.bool().unwrap(), k, descending).into_column()), DataType::String => { let ca = top_k_binary_impl(&s.str().unwrap().as_binary(), k, descending); let ca = unsafe { ca.to_string_unchecked() }; - 
Ok(ca.into_series()) + Ok(ca.into_column()) }, - DataType::Binary => Ok(top_k_binary_impl(s.binary().unwrap(), k, descending).into_series()), + DataType::Binary => Ok(top_k_binary_impl(s.binary().unwrap(), k, descending).into_column()), #[cfg(feature = "dtype-decimal")] DataType::Decimal(_, _) => { let src = src.decimal().unwrap(); let ca = top_k_num_impl(src, k, descending); let mut lca = DecimalChunked::new_logical(ca); lca.2 = Some(DataType::Decimal(src.precision(), Some(src.scale()))); - Ok(lca.into_series()) + Ok(lca.into_column()) }, DataType::Null => Ok(src.slice(0, k)), #[cfg(feature = "dtype-struct")] @@ -221,7 +221,7 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { _dt => { macro_rules! dispatch { ($ca:expr) => {{ - top_k_num_impl($ca, k, descending).into_series() + top_k_num_impl($ca, k, descending).into_column() }}; } unsafe { downcast_as_macro_arg_physical!(&s, dispatch).cast_unchecked(origin_dtype) } @@ -229,9 +229,9 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { } } -pub fn top_k_by(s: &[Column], descending: Vec) -> PolarsResult { +pub fn top_k_by(s: &[Column], descending: Vec) -> PolarsResult { /// Return (k, src, by) - fn extract_parameters(s: &[Column]) -> PolarsResult<(usize, &Series, &[Column])> { + fn extract_parameters(s: &[Column]) -> PolarsResult<(usize, &Column, &[Column])> { let k_s = &s[1]; polars_ensure!( @@ -243,7 +243,7 @@ pub fn top_k_by(s: &[Column], descending: Vec) -> PolarsResult { polars_bail!(ComputeError: "`k` must be set for `top_k`") }; - let src = &s[0].as_materialized_series(); + let src = &s[0]; let by = &s[2..]; @@ -271,10 +271,10 @@ pub fn top_k_by(s: &[Column], descending: Vec) -> PolarsResult { fn top_k_by_impl( k: usize, - src: &Series, + src: &Column, by: &[Column], descending: Vec, -) -> PolarsResult { +) -> PolarsResult { if src.is_empty() { return Ok(src.clone()); } @@ -289,6 +289,6 @@ fn top_k_by_impl( let idx = _arg_bottom_k(k, by, &mut sort_options)?; - let result = unsafe { 
src.take_unchecked(&idx.into_inner()) }; - Ok(result) + let result = unsafe { src.as_materialized_series().take_unchecked(&idx.into_inner()) }; + Ok(result.into()) } diff --git a/crates/polars-ops/src/frame/pivot/unpivot.rs b/crates/polars-ops/src/frame/pivot/unpivot.rs index ea1bd83c2791..89c38e88c37b 100644 --- a/crates/polars-ops/src/frame/pivot/unpivot.rs +++ b/crates/polars-ops/src/frame/pivot/unpivot.rs @@ -1,7 +1,8 @@ use arrow::array::{MutableArray, MutablePlString}; use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked; use polars_core::datatypes::{DataType, PlSmallStr}; -use polars_core::frame::{Column, DataFrame}; +use polars_core::frame::column::Column; +use polars_core::frame::DataFrame; use polars_core::prelude::{IntoVec, Series, UnpivotArgsIR}; use polars_core::utils::try_get_supertype; use polars_error::{polars_err, PolarsResult}; @@ -173,7 +174,8 @@ pub trait UnpivotDF: IntoDf { // SAFETY: // The give dtype is correct let values = - unsafe { Series::from_chunks_and_dtype_unchecked(value_name, vec![values_arr], &st) }.into(); + unsafe { Series::from_chunks_and_dtype_unchecked(value_name, vec![values_arr], &st) } + .into(); let variable_col = variable_col.as_box(); // SAFETY: @@ -184,7 +186,8 @@ pub trait UnpivotDF: IntoDf { vec![variable_col], &DataType::String, ) - }.into(); + } + .into(); ids.hstack_mut(&[variables, values])?; diff --git a/crates/polars-ops/src/series/ops/abs.rs b/crates/polars-ops/src/series/ops/abs.rs index 5a84678df591..21a1213ae1d1 100644 --- a/crates/polars-ops/src/series/ops/abs.rs +++ b/crates/polars-ops/src/series/ops/abs.rs @@ -1,7 +1,10 @@ use polars_core::prelude::*; /// Convert numerical values to their absolute value. 
-pub fn abs(s: &Series) -> PolarsResult { +pub fn abs(c: &Column) -> PolarsResult { + // @scalar-opt + let s = c.as_materialized_series(); + use DataType::*; let out = match s.dtype() { #[cfg(feature = "dtype-i8")] @@ -31,5 +34,5 @@ pub fn abs(s: &Series) -> PolarsResult { dt if dt.is_unsigned_integer() => s.clone(), dt => polars_bail!(opq = abs, dt), }; - Ok(out) + Ok(out.into()) } diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index cba643cf98e9..08d40e187781 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -2,12 +2,12 @@ use polars_core::prelude::*; use polars_utils::format_pl_smallstr; fn map_cats( - s: &Series, + s: &Column, labels: &[PlSmallStr], sorted_breaks: &[f64], left_closed: bool, include_breaks: bool, -) -> PolarsResult { +) -> PolarsResult { let out_name = PlSmallStr::from_static("category"); // Create new categorical and pre-register labels for consistent categorical indexes. @@ -58,12 +58,12 @@ fn map_cats( }); let outvals = vec![ - brk_vals.finish().into_series(), + brk_vals.finish().into_column(), bld.finish() ._with_fast_unique(label_has_value.iter().all(bool::clone)) - .into_series(), + .into_column(), ]; - Ok(StructChunked::from_series(out_name, &outvals)?.into_series()) + Ok(StructChunked::from_columns(out_name, &outvals)?.into_column()) } else { Ok(bld .drain_iter_and_finish(s_iter.map(|opt| { @@ -74,7 +74,7 @@ fn map_cats( }) })) ._with_fast_unique(label_has_value.iter().all(bool::clone)) - .into_series()) + .into_column()) } } @@ -96,12 +96,12 @@ pub fn compute_labels(breaks: &[f64], left_closed: bool) -> PolarsResult, labels: Option>, left_closed: bool, include_breaks: bool, -) -> PolarsResult { +) -> PolarsResult { // Breaks must be sorted to cut inputs properly. 
polars_ensure!(!breaks.iter().any(|x| x.is_nan()), ComputeError: "breaks cannot be NaN"); breaks.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); @@ -122,13 +122,13 @@ pub fn cut( } pub fn qcut( - s: &Series, + s: &Column, probs: Vec, labels: Option>, left_closed: bool, allow_duplicates: bool, include_breaks: bool, -) -> PolarsResult { +) -> PolarsResult { polars_ensure!(!probs.iter().any(|x| x.is_nan()), ComputeError: "quantiles cannot be NaN"); if s.null_count() == s.len() { @@ -177,7 +177,7 @@ mod test { use super::map_cats; - let s = Series::new("x".into(), &[1, 2, 3, 4, 5]); + let s = Column::new("x".into(), &[1, 2, 3, 4, 5]); let labels = &["a", "b", "c"].map(PlSmallStr::from_static); let breaks = &[2.0, 4.0]; diff --git a/crates/polars-ops/src/series/ops/duration.rs b/crates/polars-ops/src/series/ops/duration.rs index 1d5868260e64..bed8c8d90119 100644 --- a/crates/polars-ops/src/series/ops/duration.rs +++ b/crates/polars-ops/src/series/ops/duration.rs @@ -1,11 +1,11 @@ use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS, SECONDS_IN_DAY}; use polars_core::datatypes::{AnyValue, DataType, TimeUnit}; -use polars_core::prelude::Series; +use polars_core::prelude::Column; use polars_error::PolarsResult; -pub fn impl_duration(s: &[Series], time_unit: TimeUnit) -> PolarsResult { +pub fn impl_duration(s: &[Column], time_unit: TimeUnit) -> PolarsResult { if s.iter().any(|s| s.is_empty()) { - return Ok(Series::new_empty( + return Ok(Column::new_empty( s[0].name().clone(), &DataType::Duration(time_unit), )); @@ -21,8 +21,8 @@ pub fn impl_duration(s: &[Series], time_unit: TimeUnit) -> PolarsResult let mut microseconds = s[6].cast(&DataType::Int64).unwrap(); let mut nanoseconds = s[7].cast(&DataType::Int64).unwrap(); - let is_scalar = |s: &Series| s.len() == 1; - let is_zero_scalar = |s: &Series| is_scalar(s) && s.get(0).unwrap() == AnyValue::Int64(0); + let is_scalar = |s: &Column| s.len() == 1; + let is_zero_scalar = |s: &Column| is_scalar(s) && 
s.get(0).unwrap() == AnyValue::Int64(0); // Process subseconds let max_len = s.iter().map(|s| s.len()).max().unwrap(); @@ -87,5 +87,7 @@ pub fn impl_duration(s: &[Series], time_unit: TimeUnit) -> PolarsResult duration = (duration + weeks * (multiplier * SECONDS_IN_DAY * 7))?; } - duration.cast(&DataType::Duration(time_unit)) + duration + .cast(&DataType::Duration(time_unit)) + .map(Column::from) } diff --git a/crates/polars-ops/src/series/ops/fused.rs b/crates/polars-ops/src/series/ops/fused.rs index 16b06f76c479..a2b3215add95 100644 --- a/crates/polars-ops/src/series/ops/fused.rs +++ b/crates/polars-ops/src/series/ops/fused.rs @@ -41,17 +41,20 @@ fn fma_ca( ChunkedArray::from_chunk_iter(a.name().clone(), chunks) } -pub fn fma_series(a: &Series, b: &Series, c: &Series) -> Series { +pub fn fma_columns(a: &Column, b: &Column, c: &Column) -> Column { if a.len() == b.len() && a.len() == c.len() { with_match_physical_numeric_polars_type!(a.dtype(), |$T| { - let a: &ChunkedArray<$T> = a.as_ref().as_ref().as_ref(); - let b: &ChunkedArray<$T> = b.as_ref().as_ref().as_ref(); - let c: &ChunkedArray<$T> = c.as_ref().as_ref().as_ref(); + let a: &ChunkedArray<$T> = a.as_materialized_series().as_ref().as_ref().as_ref(); + let b: &ChunkedArray<$T> = b.as_materialized_series().as_ref().as_ref().as_ref(); + let c: &ChunkedArray<$T> = c.as_materialized_series().as_ref().as_ref().as_ref(); - fma_ca(a, b, c).into_series() + fma_ca(a, b, c).into_column() }) } else { - (a + &(b * c).unwrap()).unwrap() + (a.as_materialized_series() + + &(b.as_materialized_series() * c.as_materialized_series()).unwrap()) + .unwrap() + .into() } } @@ -92,17 +95,20 @@ fn fsm_ca( ChunkedArray::from_chunk_iter(a.name().clone(), chunks) } -pub fn fsm_series(a: &Series, b: &Series, c: &Series) -> Series { +pub fn fsm_columns(a: &Column, b: &Column, c: &Column) -> Column { if a.len() == b.len() && a.len() == c.len() { with_match_physical_numeric_polars_type!(a.dtype(), |$T| { - let a: &ChunkedArray<$T> = 
a.as_ref().as_ref().as_ref(); - let b: &ChunkedArray<$T> = b.as_ref().as_ref().as_ref(); - let c: &ChunkedArray<$T> = c.as_ref().as_ref().as_ref(); + let a: &ChunkedArray<$T> = a.as_materialized_series().as_ref().as_ref().as_ref(); + let b: &ChunkedArray<$T> = b.as_materialized_series().as_ref().as_ref().as_ref(); + let c: &ChunkedArray<$T> = c.as_materialized_series().as_ref().as_ref().as_ref(); - fsm_ca(a, b, c).into_series() + fsm_ca(a, b, c).into_column() }) } else { - (a - &(b * c).unwrap()).unwrap() + (a.as_materialized_series() + - &(b.as_materialized_series() * c.as_materialized_series()).unwrap()) + .unwrap() + .into() } } @@ -142,16 +148,18 @@ fn fms_ca( ChunkedArray::from_chunk_iter(a.name().clone(), chunks) } -pub fn fms_series(a: &Series, b: &Series, c: &Series) -> Series { +pub fn fms_columns(a: &Column, b: &Column, c: &Column) -> Column { if a.len() == b.len() && a.len() == c.len() { with_match_physical_numeric_polars_type!(a.dtype(), |$T| { - let a: &ChunkedArray<$T> = a.as_ref().as_ref().as_ref(); - let b: &ChunkedArray<$T> = b.as_ref().as_ref().as_ref(); - let c: &ChunkedArray<$T> = c.as_ref().as_ref().as_ref(); + let a: &ChunkedArray<$T> = a.as_materialized_series().as_ref().as_ref().as_ref(); + let b: &ChunkedArray<$T> = b.as_materialized_series().as_ref().as_ref().as_ref(); + let c: &ChunkedArray<$T> = c.as_materialized_series().as_ref().as_ref().as_ref(); - fms_ca(a, b, c).into_series() + fms_ca(a, b, c).into_column() }) } else { - (&(a * b).unwrap() - c).unwrap() + (&(a.as_materialized_series() * b.as_materialized_series()).unwrap() + - c.as_materialized_series()) + .unwrap().into() } } diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs index 06a8378055da..328d67763cfb 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs @@ -263,29 +263,29 @@ where } } 
-pub fn interpolate_by(s: &Series, by: &Series, by_is_sorted: bool) -> PolarsResult { +pub fn interpolate_by(s: &Column, by: &Column, by_is_sorted: bool) -> PolarsResult { polars_ensure!(s.len() == by.len(), InvalidOperation: "`by` column must be the same length as Series ({}), got {}", s.len(), by.len()); fn func( ca: &ChunkedArray, by: &ChunkedArray, is_sorted: bool, - ) -> PolarsResult + ) -> PolarsResult where T: PolarsNumericType, F: PolarsNumericType, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { if is_sorted { interpolate_impl_by_sorted(ca, by, |y_start, y_end, x, out| unsafe { signed_interp_by_sorted(y_start, y_end, x, out) }) - .map(|x| x.into_series()) + .map(|x| x.into_column()) } else { interpolate_impl_by(ca, by, |y_start, y_end, x, out, sorting_indices| unsafe { signed_interp_by(y_start, y_end, x, out, sorting_indices) }) - .map(|x| x.into_series()) + .map(|x| x.into_column()) } } diff --git a/crates/polars-ops/src/series/ops/not.rs b/crates/polars-ops/src/series/ops/not.rs index 2bb153166254..b6abf1559dce 100644 --- a/crates/polars-ops/src/series/ops/not.rs +++ b/crates/polars-ops/src/series/ops/not.rs @@ -9,8 +9,8 @@ pub fn negate_bitwise(s: &Series) -> PolarsResult { DataType::Boolean => Ok(s.bool().unwrap().not().into_series()), dt if dt.is_integer() => { with_match_physical_integer_polars_type!(dt, |$T| { - let ca: &ChunkedArray<$T> = s.as_any().downcast_ref().unwrap(); - Ok(ca.apply_values(|v| !v).into_series()) + let ca: &ChunkedArray<$T> = s.as_any().downcast_ref().unwrap(); + Ok(ca.apply_values(|v| !v).into_series()) }) }, dt => polars_bail!(InvalidOperation: "dtype {:?} not supported in 'not' operation", dt), diff --git a/crates/polars-ops/src/series/ops/rle.rs b/crates/polars-ops/src/series/ops/rle.rs index 8659512673f1..6df79825b706 100644 --- a/crates/polars-ops/src/series/ops/rle.rs +++ b/crates/polars-ops/src/series/ops/rle.rs @@ -2,14 +2,14 @@ use polars_core::prelude::*; use polars_core::series::IsSorted; /// Get the 
lengths of runs of identical values. -pub fn rle(s: &Series) -> PolarsResult { +pub fn rle(s: &Column) -> PolarsResult { let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len())); - let s_neq = s1.not_equal_missing(&s2)?; + let s_neq = s1.as_materialized_series().not_equal_missing(s2.as_materialized_series())?; let n_runs = s_neq.sum().ok_or_else(|| polars_err!(InvalidOperation: "could not evaluate 'rle_id' on series of dtype: {}", s.dtype()))? + 1; let mut lengths = Vec::::with_capacity(n_runs as usize); lengths.push(1); - let mut vals = Series::new_empty(PlSmallStr::from_static("value"), s.dtype()); + let mut vals = Column::new_empty(PlSmallStr::from_static("value"), s.dtype()); let vals = vals.extend(&s.head(Some(1)))?.extend(&s2.filter(&s_neq)?)?; let mut idx = 0; @@ -26,19 +26,19 @@ pub fn rle(s: &Series) -> PolarsResult { } let outvals = vec![ - Series::from_vec(PlSmallStr::from_static("len"), lengths), + Series::from_vec(PlSmallStr::from_static("len"), lengths).into(), vals.to_owned(), ]; - Ok(StructChunked::from_series(s.name().clone(), &outvals)?.into_series()) + Ok(StructChunked::from_columns(s.name().clone(), &outvals)?.into_column()) } /// Similar to `rle`, but maps values to run IDs. 
-pub fn rle_id(s: &Series) -> PolarsResult { +pub fn rle_id(s: &Column) -> PolarsResult { if s.len() == 0 { - return Ok(Series::new_empty(s.name().clone(), &IDX_DTYPE)); + return Ok(Column::new_empty(s.name().clone(), &IDX_DTYPE)); } let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len())); - let s_neq = s1.not_equal_missing(&s2)?; + let s_neq = s1.as_materialized_series().not_equal_missing(s2.as_materialized_series())?; let mut out = Vec::::with_capacity(s.len()); let mut last = 0; @@ -52,5 +52,5 @@ pub fn rle_id(s: &Series) -> PolarsResult { } Ok(IdxCa::from_vec(s.name().clone(), out) .with_sorted_flag(IsSorted::Ascending) - .into_series()) + .into_column()) } diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index 558a7a98a42a..a5b7db2e8437 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -164,7 +164,7 @@ impl ArrayNameSpace { move |s| { s.array()? .to_struct(name_generator.clone()) - .map(|s| Some(s.into_series())) + .map(|s| Some(s.into_column())) }, GetOutput::map_dtype(move |dt: &DataType| { let DataType::Array(inner, width) = dt else { diff --git a/crates/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs index a8c48cd17fb8..0bbecd7e1d77 100644 --- a/crates/polars-plan/src/dsl/expr.rs +++ b/crates/polars-plan/src/dsl/expr.rs @@ -153,7 +153,7 @@ pub enum Expr { /// function arguments input: Vec, /// function to apply - function: SpecialEq>, + function: SpecialEq>, /// output dtype of the function output_type: GetOutput, options: FunctionOptions, diff --git a/crates/polars-plan/src/dsl/expr_dyn_fn.rs b/crates/polars-plan/src/dsl/expr_dyn_fn.rs index 9ac6f872eed8..d79858706385 100644 --- a/crates/polars-plan/src/dsl/expr_dyn_fn.rs +++ b/crates/polars-plan/src/dsl/expr_dyn_fn.rs @@ -10,12 +10,12 @@ use serde::{Deserializer, Serializer}; use super::*; /// A wrapper trait for any closure `Fn(Vec) -> PolarsResult` -pub trait SeriesUdf: Send + Sync { +pub trait 
ColumnsUdf: Send + Sync { fn as_any(&self) -> &dyn std::any::Any { unimplemented!("as_any not implemented for this 'opaque' function") } - fn call_udf(&self, s: &mut [Series]) -> PolarsResult>; + fn call_udf(&self, s: &mut [Column]) -> PolarsResult>; fn try_serialize(&self, _buf: &mut Vec) -> PolarsResult<()> { polars_bail!(ComputeError: "serialization not supported for this 'opaque' function") @@ -31,7 +31,7 @@ pub trait SeriesUdf: Send + Sync { } #[cfg(feature = "serde")] -impl Serialize for SpecialEq> { +impl Serialize for SpecialEq> { fn serialize(&self, serializer: S) -> std::result::Result where S: Serializer, @@ -46,7 +46,7 @@ impl Serialize for SpecialEq> { } #[cfg(feature = "serde")] -impl<'a> Deserialize<'a> for SpecialEq> { +impl<'a> Deserialize<'a> for SpecialEq> { fn deserialize(deserializer: D) -> std::result::Result where D: Deserializer<'a>, @@ -75,42 +75,42 @@ impl<'a> Deserialize<'a> for SpecialEq> { } } -impl SeriesUdf for F +impl ColumnsUdf for F where - F: Fn(&mut [Series]) -> PolarsResult> + Send + Sync, + F: Fn(&mut [Column]) -> PolarsResult> + Send + Sync, { - fn call_udf(&self, s: &mut [Series]) -> PolarsResult> { + fn call_udf(&self, s: &mut [Column]) -> PolarsResult> { self(s) } } -impl Debug for dyn SeriesUdf { +impl Debug for dyn ColumnsUdf { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "SeriesUdf") + write!(f, "ColumnUdf") } } -/// A wrapper trait for any binary closure `Fn(Series, Series) -> PolarsResult` -pub trait SeriesBinaryUdf: Send + Sync { - fn call_udf(&self, a: Series, b: Series) -> PolarsResult; +/// A wrapper trait for any binary closure `Fn(Column, Column) -> PolarsResult` +pub trait ColumnBinaryUdf: Send + Sync { + fn call_udf(&self, a: Column, b: Column) -> PolarsResult; } -impl SeriesBinaryUdf for F +impl ColumnBinaryUdf for F where - F: Fn(Series, Series) -> PolarsResult + Send + Sync, + F: Fn(Column, Column) -> PolarsResult + Send + Sync, { - fn call_udf(&self, a: Series, b: Series) -> 
PolarsResult { + fn call_udf(&self, a: Column, b: Column) -> PolarsResult { self(a, b) } } -impl Debug for dyn SeriesBinaryUdf { +impl Debug for dyn ColumnBinaryUdf { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "SeriesBinaryUdf") + write!(f, "ColumnBinaryUdf") } } -impl Default for SpecialEq> { +impl Default for SpecialEq> { fn default() -> Self { panic!("implementation error"); } diff --git a/crates/polars-plan/src/dsl/function_expr/abs.rs b/crates/polars-plan/src/dsl/function_expr/abs.rs index 45e99ea42648..1f0435e05772 100644 --- a/crates/polars-plan/src/dsl/function_expr/abs.rs +++ b/crates/polars-plan/src/dsl/function_expr/abs.rs @@ -1,5 +1,5 @@ use super::*; -pub(super) fn abs(s: &Series) -> PolarsResult { +pub(super) fn abs(s: &Column) -> PolarsResult { polars_ops::prelude::abs(s) } diff --git a/crates/polars-plan/src/dsl/function_expr/arg_where.rs b/crates/polars-plan/src/dsl/function_expr/arg_where.rs index 8f77be0724bd..ab0afba55960 100644 --- a/crates/polars-plan/src/dsl/function_expr/arg_where.rs +++ b/crates/polars-plan/src/dsl/function_expr/arg_where.rs @@ -2,11 +2,11 @@ use polars_core::utils::arrow::bitmap::utils::SlicesIterator; use super::*; -pub(super) fn arg_where(s: &mut [Series]) -> PolarsResult> { +pub(super) fn arg_where(s: &mut [Column]) -> PolarsResult> { let predicate = s[0].bool()?; if predicate.is_empty() { - Ok(Some(Series::full_null( + Ok(Some(Column::full_null( predicate.name().clone(), 0, &IDX_DTYPE, @@ -37,6 +37,6 @@ pub(super) fn arg_where(s: &mut [Series]) -> PolarsResult> { total_offset += arr.len(); }); let ca = IdxCa::with_chunk(predicate.name().clone(), IdxArr::from_vec(out)); - Ok(Some(ca.into_series())) + Ok(Some(ca.into_column())) } } diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 0de5e9d99883..d1c92cba1df2 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs 
@@ -101,7 +101,7 @@ impl Display for ArrayFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: ArrayFunction) -> Self { use ArrayFunction::*; match func { @@ -133,101 +133,101 @@ impl From for SpecialEq> { } } -pub(super) fn max(s: &Series) -> PolarsResult { - Ok(s.array()?.array_max()) +pub(super) fn max(s: &Column) -> PolarsResult { + Ok(s.array()?.array_max().into()) } -pub(super) fn min(s: &Series) -> PolarsResult { - Ok(s.array()?.array_min()) +pub(super) fn min(s: &Column) -> PolarsResult { + Ok(s.array()?.array_min().into()) } -pub(super) fn sum(s: &Series) -> PolarsResult { - s.array()?.array_sum() +pub(super) fn sum(s: &Column) -> PolarsResult { + s.array()?.array_sum().map(Column::from) } -pub(super) fn std(s: &Series, ddof: u8) -> PolarsResult { - s.array()?.array_std(ddof) +pub(super) fn std(s: &Column, ddof: u8) -> PolarsResult { + s.array()?.array_std(ddof).map(Column::from) } -pub(super) fn var(s: &Series, ddof: u8) -> PolarsResult { - s.array()?.array_var(ddof) +pub(super) fn var(s: &Column, ddof: u8) -> PolarsResult { + s.array()?.array_var(ddof).map(Column::from) } -pub(super) fn median(s: &Series) -> PolarsResult { - s.array()?.array_median() +pub(super) fn median(s: &Column) -> PolarsResult { + s.array()?.array_median().map(Column::from) } -pub(super) fn unique(s: &Series, stable: bool) -> PolarsResult { +pub(super) fn unique(s: &Column, stable: bool) -> PolarsResult { let ca = s.array()?; let out = if stable { ca.array_unique_stable() } else { ca.array_unique() }; - out.map(|ca| ca.into_series()) + out.map(|ca| ca.into_column()) } -pub(super) fn n_unique(s: &Series) -> PolarsResult { - Ok(s.array()?.array_n_unique()?.into_series()) +pub(super) fn n_unique(s: &Column) -> PolarsResult { + Ok(s.array()?.array_n_unique()?.into_column()) } -pub(super) fn to_list(s: &Series) -> PolarsResult { +pub(super) fn to_list(s: &Column) -> PolarsResult { let list_dtype = map_array_dtype_to_list_dtype(s.dtype())?; 
s.cast(&list_dtype) } #[cfg(feature = "array_any_all")] -pub(super) fn any(s: &Series) -> PolarsResult { - s.array()?.array_any() +pub(super) fn any(s: &Column) -> PolarsResult { + s.array()?.array_any().map(Column::from) } #[cfg(feature = "array_any_all")] -pub(super) fn all(s: &Series) -> PolarsResult { - s.array()?.array_all() +pub(super) fn all(s: &Column) -> PolarsResult { + s.array()?.array_all().map(Column::from) } -pub(super) fn sort(s: &Series, options: SortOptions) -> PolarsResult { - Ok(s.array()?.array_sort(options)?.into_series()) +pub(super) fn sort(s: &Column, options: SortOptions) -> PolarsResult { + Ok(s.array()?.array_sort(options)?.into_column()) } -pub(super) fn reverse(s: &Series) -> PolarsResult { - Ok(s.array()?.array_reverse().into_series()) +pub(super) fn reverse(s: &Column) -> PolarsResult { + Ok(s.array()?.array_reverse().into_column()) } -pub(super) fn arg_min(s: &Series) -> PolarsResult { - Ok(s.array()?.array_arg_min().into_series()) +pub(super) fn arg_min(s: &Column) -> PolarsResult { + Ok(s.array()?.array_arg_min().into_column()) } -pub(super) fn arg_max(s: &Series) -> PolarsResult { - Ok(s.array()?.array_arg_max().into_series()) +pub(super) fn arg_max(s: &Column) -> PolarsResult { + Ok(s.array()?.array_arg_max().into_column()) } -pub(super) fn get(s: &[Series], null_on_oob: bool) -> PolarsResult { +pub(super) fn get(s: &[Column], null_on_oob: bool) -> PolarsResult { let ca = s[0].array()?; let index = s[1].cast(&DataType::Int64)?; let index = index.i64().unwrap(); - ca.array_get(index, null_on_oob) + ca.array_get(index, null_on_oob).map(Column::from) } -pub(super) fn join(s: &[Series], ignore_nulls: bool) -> PolarsResult { +pub(super) fn join(s: &[Column], ignore_nulls: bool) -> PolarsResult { let ca = s[0].array()?; let separator = s[1].str()?; - ca.array_join(separator, ignore_nulls) + ca.array_join(separator, ignore_nulls).map(Column::from) } #[cfg(feature = "is_in")] -pub(super) fn contains(s: &[Series]) -> PolarsResult { 
+pub(super) fn contains(s: &[Column]) -> PolarsResult { let array = &s[0]; let item = &s[1]; polars_ensure!(matches!(array.dtype(), DataType::Array(_, _)), SchemaMismatch: "invalid series dtype: expected `Array`, got `{}`", array.dtype(), ); - Ok(is_in(item, array)? + Ok(is_in(item.as_materialized_series(), array.as_materialized_series())? .with_name(array.name().clone()) - .into_series()) + .into_column()) } #[cfg(feature = "array_count")] -pub(super) fn count_matches(args: &[Series]) -> PolarsResult { +pub(super) fn count_matches(args: &[Column]) -> PolarsResult { let s = &args[0]; let element = &args[1]; polars_ensure!( @@ -236,12 +236,12 @@ pub(super) fn count_matches(args: &[Series]) -> PolarsResult { element.len() ); let ca = s.array()?; - ca.array_count_matches(element.get(0).unwrap()) + ca.array_count_matches(element.get(0).unwrap()).map(Column::from) } -pub(super) fn shift(s: &[Series]) -> PolarsResult { +pub(super) fn shift(s: &[Column]) -> PolarsResult { let ca = s[0].array()?; let n = &s[1]; - ca.array_shift(n) + ca.array_shift(n.as_materialized_series()).map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/binary.rs b/crates/polars-plan/src/dsl/function_expr/binary.rs index f803ba0ba952..88f3ad71b545 100644 --- a/crates/polars-plan/src/dsl/function_expr/binary.rs +++ b/crates/polars-plan/src/dsl/function_expr/binary.rs @@ -57,7 +57,7 @@ impl Display for BinaryFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: BinaryFunction) -> Self { use BinaryFunction::*; match func { @@ -83,62 +83,62 @@ impl From for SpecialEq> { } } -pub(super) fn contains(s: &[Series]) -> PolarsResult { +pub(super) fn contains(s: &[Column]) -> PolarsResult { let ca = s[0].binary()?; let lit = s[1].binary()?; Ok(ca .contains_chunked(lit) .with_name(ca.name().clone()) - .into_series()) + .into_column()) } -pub(super) fn ends_with(s: &[Series]) -> PolarsResult { +pub(super) fn ends_with(s: &[Column]) -> PolarsResult { let ca = 
s[0].binary()?; let suffix = s[1].binary()?; Ok(ca .ends_with_chunked(suffix) .with_name(ca.name().clone()) - .into_series()) + .into_column()) } -pub(super) fn starts_with(s: &[Series]) -> PolarsResult { +pub(super) fn starts_with(s: &[Column]) -> PolarsResult { let ca = s[0].binary()?; let prefix = s[1].binary()?; Ok(ca .starts_with_chunked(prefix) .with_name(ca.name().clone()) - .into_series()) + .into_column()) } -pub(super) fn size_bytes(s: &Series) -> PolarsResult { +pub(super) fn size_bytes(s: &Column) -> PolarsResult { let ca = s.binary()?; - Ok(ca.size_bytes().into_series()) + Ok(ca.size_bytes().into_column()) } #[cfg(feature = "binary_encoding")] -pub(super) fn hex_decode(s: &Series, strict: bool) -> PolarsResult { +pub(super) fn hex_decode(s: &Column, strict: bool) -> PolarsResult { let ca = s.binary()?; - ca.hex_decode(strict).map(|ok| ok.into_series()) + ca.hex_decode(strict).map(|ok| ok.into_column()) } #[cfg(feature = "binary_encoding")] -pub(super) fn hex_encode(s: &Series) -> PolarsResult { +pub(super) fn hex_encode(s: &Column) -> PolarsResult { let ca = s.binary()?; - Ok(ca.hex_encode()) + Ok(ca.hex_encode().into()) } #[cfg(feature = "binary_encoding")] -pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult { +pub(super) fn base64_decode(s: &Column, strict: bool) -> PolarsResult { let ca = s.binary()?; - ca.base64_decode(strict).map(|ok| ok.into_series()) + ca.base64_decode(strict).map(|ok| ok.into_column()) } #[cfg(feature = "binary_encoding")] -pub(super) fn base64_encode(s: &Series) -> PolarsResult { +pub(super) fn base64_encode(s: &Column) -> PolarsResult { let ca = s.binary()?; - Ok(ca.base64_encode()) + Ok(ca.base64_encode().into()) } impl From for FunctionExpr { diff --git a/crates/polars-plan/src/dsl/function_expr/boolean.rs b/crates/polars-plan/src/dsl/function_expr/boolean.rs index d00045c0d3f9..089fed3dc51b 100644 --- a/crates/polars-plan/src/dsl/function_expr/boolean.rs +++ 
b/crates/polars-plan/src/dsl/function_expr/boolean.rs @@ -93,7 +93,7 @@ impl Display for BooleanFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: BooleanFunction) -> Self { use BooleanFunction::*; match func { @@ -130,89 +130,99 @@ impl From for FunctionExpr { } } -fn any(s: &Series, ignore_nulls: bool) -> PolarsResult { +fn any(s: &Column, ignore_nulls: bool) -> PolarsResult { let ca = s.bool()?; if ignore_nulls { - Ok(Series::new(s.name().clone(), [ca.any()])) + Ok(Column::new(s.name().clone(), [ca.any()])) } else { - Ok(Series::new(s.name().clone(), [ca.any_kleene()])) + Ok(Column::new(s.name().clone(), [ca.any_kleene()])) } } -fn all(s: &Series, ignore_nulls: bool) -> PolarsResult { +fn all(s: &Column, ignore_nulls: bool) -> PolarsResult { let ca = s.bool()?; if ignore_nulls { - Ok(Series::new(s.name().clone(), [ca.all()])) + Ok(Column::new(s.name().clone(), [ca.all()])) } else { - Ok(Series::new(s.name().clone(), [ca.all_kleene()])) + Ok(Column::new(s.name().clone(), [ca.all_kleene()])) } } -fn is_null(s: &Series) -> PolarsResult { - Ok(s.is_null().into_series()) +fn is_null(s: &Column) -> PolarsResult { + Ok(s.is_null().into_column()) } -fn is_not_null(s: &Series) -> PolarsResult { - Ok(s.is_not_null().into_series()) +fn is_not_null(s: &Column) -> PolarsResult { + Ok(s.is_not_null().into_column()) } -fn is_finite(s: &Series) -> PolarsResult { - s.is_finite().map(|ca| ca.into_series()) +fn is_finite(s: &Column) -> PolarsResult { + s.is_finite().map(|ca| ca.into_column()) } -fn is_infinite(s: &Series) -> PolarsResult { - s.is_infinite().map(|ca| ca.into_series()) +fn is_infinite(s: &Column) -> PolarsResult { + s.is_infinite().map(|ca| ca.into_column()) } -pub(super) fn is_nan(s: &Series) -> PolarsResult { - s.is_nan().map(|ca| ca.into_series()) +pub(super) fn is_nan(s: &Column) -> PolarsResult { + s.is_nan().map(|ca| ca.into_column()) } -pub(super) fn is_not_nan(s: &Series) -> PolarsResult { - s.is_not_nan().map(|ca| 
ca.into_series()) +pub(super) fn is_not_nan(s: &Column) -> PolarsResult { + s.is_not_nan().map(|ca| ca.into_column()) } #[cfg(feature = "is_first_distinct")] -fn is_first_distinct(s: &Series) -> PolarsResult { - polars_ops::prelude::is_first_distinct(s).map(|ca| ca.into_series()) +fn is_first_distinct(s: &Column) -> PolarsResult { + polars_ops::prelude::is_first_distinct(s.as_materialized_series()).map(|ca| ca.into_column()) } #[cfg(feature = "is_last_distinct")] -fn is_last_distinct(s: &Series) -> PolarsResult { - polars_ops::prelude::is_last_distinct(s).map(|ca| ca.into_series()) +fn is_last_distinct(s: &Column) -> PolarsResult { + polars_ops::prelude::is_last_distinct(s.as_materialized_series()).map(|ca| ca.into_column()) } #[cfg(feature = "is_unique")] -fn is_unique(s: &Series) -> PolarsResult { - polars_ops::prelude::is_unique(s).map(|ca| ca.into_series()) +fn is_unique(s: &Column) -> PolarsResult { + polars_ops::prelude::is_unique(s.as_materialized_series()).map(|ca| ca.into_column()) } #[cfg(feature = "is_unique")] -fn is_duplicated(s: &Series) -> PolarsResult { - polars_ops::prelude::is_duplicated(s).map(|ca| ca.into_series()) +fn is_duplicated(s: &Column) -> PolarsResult { + polars_ops::prelude::is_duplicated(s.as_materialized_series()).map(|ca| ca.into_column()) } #[cfg(feature = "is_between")] -fn is_between(s: &[Series], closed: ClosedInterval) -> PolarsResult { +fn is_between(s: &[Column], closed: ClosedInterval) -> PolarsResult { let ser = &s[0]; let lower = &s[1]; let upper = &s[2]; - polars_ops::prelude::is_between(ser, lower, upper, closed).map(|ca| ca.into_series()) + polars_ops::prelude::is_between( + ser.as_materialized_series(), + lower.as_materialized_series(), + upper.as_materialized_series(), + closed, + ) + .map(|ca| ca.into_column()) } #[cfg(feature = "is_in")] -fn is_in(s: &mut [Series]) -> PolarsResult> { +fn is_in(s: &mut [Column]) -> PolarsResult> { let left = &s[0]; let other = &s[1]; - polars_ops::prelude::is_in(left, other).map(|ca| 
Some(ca.into_series())) + polars_ops::prelude::is_in( + left.as_materialized_series(), + other.as_materialized_series(), + ) + .map(|ca| Some(ca.into_column())) } -fn not(s: &Series) -> PolarsResult { - polars_ops::series::negate_bitwise(s) +fn not(s: &Column) -> PolarsResult { + polars_ops::series::negate_bitwise(s.as_materialized_series()).map(Column::from) } // We shouldn't hit these often only on very wide dataframes where we don't reduce to & expressions. -fn any_horizontal(s: &[Series]) -> PolarsResult { +fn any_horizontal(s: &[Column]) -> PolarsResult { let out = POOL .install(|| { s.par_iter() @@ -230,11 +240,11 @@ fn any_horizontal(s: &[Series]) -> PolarsResult { ) })? .with_name(s[0].name().clone()); - Ok(out.into_series()) + Ok(out.into_column()) } // We shouldn't hit these often only on very wide dataframes where we don't reduce to & expressions. -fn all_horizontal(s: &[Series]) -> PolarsResult { +fn all_horizontal(s: &[Column]) -> PolarsResult { let out = POOL .install(|| { s.par_iter() @@ -252,5 +262,5 @@ fn all_horizontal(s: &[Series]) -> PolarsResult { ) })? 
.with_name(s[0].name().clone()); - Ok(out.into_series()) + Ok(out.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/bounds.rs b/crates/polars-plan/src/dsl/function_expr/bounds.rs index 0f14feb5675f..fb589cfbfb4d 100644 --- a/crates/polars-plan/src/dsl/function_expr/bounds.rs +++ b/crates/polars-plan/src/dsl/function_expr/bounds.rs @@ -1,23 +1,24 @@ use super::*; -pub(super) fn upper_bound(s: &Series) -> PolarsResult { +pub(super) fn upper_bound(s: &Column) -> PolarsResult { + // @scalar-opt let name = s.name().clone(); use DataType::*; let s = match s.dtype().to_physical() { #[cfg(feature = "dtype-i8")] - Int8 => Series::new(name, &[i8::MAX]), + Int8 => Column::new(name, &[i8::MAX]), #[cfg(feature = "dtype-i16")] - Int16 => Series::new(name, &[i16::MAX]), - Int32 => Series::new(name, &[i32::MAX]), - Int64 => Series::new(name, &[i64::MAX]), + Int16 => Column::new(name, &[i16::MAX]), + Int32 => Column::new(name, &[i32::MAX]), + Int64 => Column::new(name, &[i64::MAX]), #[cfg(feature = "dtype-u8")] - UInt8 => Series::new(name, &[u8::MAX]), + UInt8 => Column::new(name, &[u8::MAX]), #[cfg(feature = "dtype-u16")] - UInt16 => Series::new(name, &[u16::MAX]), - UInt32 => Series::new(name, &[u32::MAX]), - UInt64 => Series::new(name, &[u64::MAX]), - Float32 => Series::new(name, &[f32::INFINITY]), - Float64 => Series::new(name, &[f64::INFINITY]), + UInt16 => Column::new(name, &[u16::MAX]), + UInt32 => Column::new(name, &[u32::MAX]), + UInt64 => Column::new(name, &[u64::MAX]), + Float32 => Column::new(name, &[f32::INFINITY]), + Float64 => Column::new(name, &[f64::INFINITY]), dt => polars_bail!( ComputeError: "cannot determine upper bound for dtype `{}`", dt, ), @@ -25,24 +26,25 @@ pub(super) fn upper_bound(s: &Series) -> PolarsResult { Ok(s) } -pub(super) fn lower_bound(s: &Series) -> PolarsResult { +pub(super) fn lower_bound(s: &Column) -> PolarsResult { + // @scalar-opt let name = s.name().clone(); use DataType::*; let s = match s.dtype().to_physical() { 
#[cfg(feature = "dtype-i8")] - Int8 => Series::new(name, &[i8::MIN]), + Int8 => Column::new(name, &[i8::MIN]), #[cfg(feature = "dtype-i16")] - Int16 => Series::new(name, &[i16::MIN]), - Int32 => Series::new(name, &[i32::MIN]), - Int64 => Series::new(name, &[i64::MIN]), + Int16 => Column::new(name, &[i16::MIN]), + Int32 => Column::new(name, &[i32::MIN]), + Int64 => Column::new(name, &[i64::MIN]), #[cfg(feature = "dtype-u8")] - UInt8 => Series::new(name, &[u8::MIN]), + UInt8 => Column::new(name, &[u8::MIN]), #[cfg(feature = "dtype-u16")] - UInt16 => Series::new(name, &[u16::MIN]), - UInt32 => Series::new(name, &[u32::MIN]), - UInt64 => Series::new(name, &[u64::MIN]), - Float32 => Series::new(name, &[f32::NEG_INFINITY]), - Float64 => Series::new(name, &[f64::NEG_INFINITY]), + UInt16 => Column::new(name, &[u16::MIN]), + UInt32 => Column::new(name, &[u32::MIN]), + UInt64 => Column::new(name, &[u64::MIN]), + Float32 => Column::new(name, &[f32::NEG_INFINITY]), + Float64 => Column::new(name, &[f64::NEG_INFINITY]), dt => polars_bail!( ComputeError: "cannot determine lower bound for dtype `{}`", dt, ), diff --git a/crates/polars-plan/src/dsl/function_expr/business.rs b/crates/polars-plan/src/dsl/function_expr/business.rs index 0d4fc2939d98..c488666503ae 100644 --- a/crates/polars-plan/src/dsl/function_expr/business.rs +++ b/crates/polars-plan/src/dsl/function_expr/business.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use crate::dsl::SpecialEq; use crate::map_as_slice; -use crate::prelude::SeriesUdf; +use crate::prelude::ColumnsUdf; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Debug, Eq, Hash)] @@ -37,7 +37,7 @@ impl Display for BusinessFunction { write!(f, "{s}") } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: BusinessFunction) -> Self { use BusinessFunction::*; match func { @@ -62,23 +62,36 @@ impl From for SpecialEq> { #[cfg(feature = "business")] pub(super) fn business_day_count( - s: 
&[Series], + s: &[Column], week_mask: [bool; 7], holidays: &[i32], -) -> PolarsResult { +) -> PolarsResult { let start = &s[0]; let end = &s[1]; - polars_ops::prelude::business_day_count(start, end, week_mask, holidays) + polars_ops::prelude::business_day_count( + start.as_materialized_series(), + end.as_materialized_series(), + week_mask, + holidays, + ) + .map(Column::from) } #[cfg(feature = "business")] pub(super) fn add_business_days( - s: &[Series], + s: &[Column], week_mask: [bool; 7], holidays: &[i32], roll: Roll, -) -> PolarsResult { +) -> PolarsResult { let start = &s[0]; let n = &s[1]; - polars_ops::prelude::add_business_days(start, n, week_mask, holidays, roll) + polars_ops::prelude::add_business_days( + start.as_materialized_series(), + n.as_materialized_series(), + week_mask, + holidays, + roll, + ) + .map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/cat.rs b/crates/polars-plan/src/dsl/function_expr/cat.rs index 9cc5d993a638..b25215589789 100644 --- a/crates/polars-plan/src/dsl/function_expr/cat.rs +++ b/crates/polars-plan/src/dsl/function_expr/cat.rs @@ -26,7 +26,7 @@ impl Display for CategoricalFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: CategoricalFunction) -> Self { use CategoricalFunction::*; match func { @@ -41,10 +41,10 @@ impl From for FunctionExpr { } } -fn get_categories(s: &Series) -> PolarsResult { +fn get_categories(s: &Column) -> PolarsResult { // categorical check let ca = s.categorical()?; let rev_map = ca.get_rev_map(); let arr = rev_map.get_categories().clone().boxed(); - Series::try_from((ca.name().clone(), arr)) + Series::try_from((ca.name().clone(), arr)).map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/clip.rs b/crates/polars-plan/src/dsl/function_expr/clip.rs index adae248a8af2..4b537c811235 100644 --- a/crates/polars-plan/src/dsl/function_expr/clip.rs +++ b/crates/polars-plan/src/dsl/function_expr/clip.rs @@ -1,10 +1,21 @@ use super::*; 
-pub(super) fn clip(s: &[Series], has_min: bool, has_max: bool) -> PolarsResult { +pub(super) fn clip(s: &[Column], has_min: bool, has_max: bool) -> PolarsResult { match (has_min, has_max) { - (true, true) => polars_ops::series::clip(&s[0], &s[1], &s[2]), - (true, false) => polars_ops::series::clip_min(&s[0], &s[1]), - (false, true) => polars_ops::series::clip_max(&s[0], &s[1]), + (true, true) => polars_ops::series::clip( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + &s[2].as_materialized_series(), + ), + (true, false) => polars_ops::series::clip_min( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + ), + (false, true) => polars_ops::series::clip_max( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + ), _ => unreachable!(), } + .map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/coerce.rs b/crates/polars-plan/src/dsl/function_expr/coerce.rs index 652866491edb..bd03ede32c84 100644 --- a/crates/polars-plan/src/dsl/function_expr/coerce.rs +++ b/crates/polars-plan/src/dsl/function_expr/coerce.rs @@ -1,5 +1,5 @@ use polars_core::prelude::*; -pub fn as_struct(s: &[Series]) -> PolarsResult { - Ok(StructChunked::from_series(s[0].name().clone(), s)?.into_series()) +pub fn as_struct(s: &[Column]) -> PolarsResult { + Ok(StructChunked::from_columns(s[0].name().clone(), s)?.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/concat.rs b/crates/polars-plan/src/dsl/function_expr/concat.rs index 3c069fd90bf8..a021545f2ad0 100644 --- a/crates/polars-plan/src/dsl/function_expr/concat.rs +++ b/crates/polars-plan/src/dsl/function_expr/concat.rs @@ -1,6 +1,6 @@ use super::*; -pub(super) fn concat_expr(s: &[Series], rechunk: bool) -> PolarsResult { +pub(super) fn concat_expr(s: &[Column], rechunk: bool) -> PolarsResult { let mut first = s[0].clone(); for s in &s[1..] 
{ diff --git a/crates/polars-plan/src/dsl/function_expr/correlation.rs b/crates/polars-plan/src/dsl/function_expr/correlation.rs index 216a635ba475..14b3f0f77a4c 100644 --- a/crates/polars-plan/src/dsl/function_expr/correlation.rs +++ b/crates/polars-plan/src/dsl/function_expr/correlation.rs @@ -25,7 +25,7 @@ impl Display for CorrelationMethod { } } -pub(super) fn corr(s: &[Series], ddof: u8, method: CorrelationMethod) -> PolarsResult { +pub(super) fn corr(s: &[Column], ddof: u8, method: CorrelationMethod) -> PolarsResult { match method { CorrelationMethod::Pearson => pearson_corr(s, ddof), #[cfg(all(feature = "rank", feature = "propagate_nans"))] @@ -36,7 +36,7 @@ pub(super) fn corr(s: &[Series], ddof: u8, method: CorrelationMethod) -> PolarsR } } -fn covariance(s: &[Series], ddof: u8) -> PolarsResult { +fn covariance(s: &[Column], ddof: u8) -> PolarsResult { let a = &s[0]; let b = &s[1]; let name = PlSmallStr::from_static("cov"); @@ -45,7 +45,7 @@ fn covariance(s: &[Series], ddof: u8) -> PolarsResult { let ret = match a.dtype() { DataType::Float32 => { let ret = cov(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32); - return Ok(Series::new(name, &[ret])); + return Ok(Column::new(name, &[ret])); }, DataType::Float64 => cov(a.f64().unwrap(), b.f64().unwrap(), ddof), DataType::Int32 => cov(a.i32().unwrap(), b.i32().unwrap(), ddof), @@ -58,10 +58,10 @@ fn covariance(s: &[Series], ddof: u8) -> PolarsResult { cov(a.f64().unwrap(), b.f64().unwrap(), ddof) }, }; - Ok(Series::new(name, &[ret])) + Ok(Column::new(name, &[ret])) } -fn pearson_corr(s: &[Series], ddof: u8) -> PolarsResult { +fn pearson_corr(s: &[Column], ddof: u8) -> PolarsResult { let a = &s[0]; let b = &s[1]; let name = PlSmallStr::from_static("pearson_corr"); @@ -70,7 +70,7 @@ fn pearson_corr(s: &[Series], ddof: u8) -> PolarsResult { let ret = match a.dtype() { DataType::Float32 => { let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32); - return 
Ok(Series::new(name.clone(), &[ret])); + return Ok(Column::new(name.clone(), &[ret])); }, DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof), DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap(), ddof), @@ -82,29 +82,29 @@ fn pearson_corr(s: &[Series], ddof: u8) -> PolarsResult { pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof) }, }; - Ok(Series::new(name, &[ret])) + Ok(Column::new(name, &[ret])) } #[cfg(all(feature = "rank", feature = "propagate_nans"))] -fn spearman_rank_corr(s: &[Series], ddof: u8, propagate_nans: bool) -> PolarsResult { - use polars_core::utils::coalesce_nulls_series; +fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsResult { + use polars_core::utils::coalesce_nulls_columns; use polars_ops::chunked_array::nan_propagating_aggregate::nan_max_s; let a = &s[0]; let b = &s[1]; - let (a, b) = coalesce_nulls_series(a, b); + let (a, b) = coalesce_nulls_columns(a, b); let name = PlSmallStr::from_static("spearman_rank_correlation"); if propagate_nans && a.dtype().is_float() { for s in [&a, &b] { - if nan_max_s(s, PlSmallStr::EMPTY) + if nan_max_s(s.as_materialized_series(), PlSmallStr::EMPTY) .get(0) .unwrap() .extract::() .unwrap() .is_nan() { - return Ok(Series::new(name, &[f64::NAN])); + return Ok(Column::new(name, &[f64::NAN])); } } } @@ -113,20 +113,20 @@ fn spearman_rank_corr(s: &[Series], ddof: u8, propagate_nans: bool) -> PolarsRes let a = a.drop_nulls(); let b = b.drop_nulls(); - let a_rank = a.rank( + let a_rank = a.as_materialized_series().rank( RankOptions { method: RankMethod::Average, ..Default::default() }, None, - ); - let b_rank = b.rank( + ).into(); + let b_rank = b.as_materialized_series().rank( RankOptions { method: RankMethod::Average, ..Default::default() }, None, - ); + ).into(); pearson_corr(&[a_rank, b_rank], ddof) } diff --git a/crates/polars-plan/src/dsl/function_expr/cum.rs b/crates/polars-plan/src/dsl/function_expr/cum.rs index 74ad6eec596a..755199c3a2a0 
100644 --- a/crates/polars-plan/src/dsl/function_expr/cum.rs +++ b/crates/polars-plan/src/dsl/function_expr/cum.rs @@ -1,23 +1,28 @@ use super::*; -pub(super) fn cum_count(s: &Series, reverse: bool) -> PolarsResult { - polars_ops::prelude::cum_count(s, reverse) +pub(super) fn cum_count(s: &Column, reverse: bool) -> PolarsResult { + // @scalar-opt + polars_ops::prelude::cum_count(s.as_materialized_series(), reverse).map(Column::from) } -pub(super) fn cum_sum(s: &Series, reverse: bool) -> PolarsResult { - polars_ops::prelude::cum_sum(s, reverse) +pub(super) fn cum_sum(s: &Column, reverse: bool) -> PolarsResult { + // @scalar-opt + polars_ops::prelude::cum_sum(s.as_materialized_series(), reverse).map(Column::from) } -pub(super) fn cum_prod(s: &Series, reverse: bool) -> PolarsResult { - polars_ops::prelude::cum_prod(s, reverse) +pub(super) fn cum_prod(s: &Column, reverse: bool) -> PolarsResult { + // @scalar-opt + polars_ops::prelude::cum_prod(s.as_materialized_series(), reverse).map(Column::from) } -pub(super) fn cum_min(s: &Series, reverse: bool) -> PolarsResult { - polars_ops::prelude::cum_min(s, reverse) +pub(super) fn cum_min(s: &Column, reverse: bool) -> PolarsResult { + // @scalar-opt + polars_ops::prelude::cum_min(s.as_materialized_series(), reverse).map(Column::from) } -pub(super) fn cum_max(s: &Series, reverse: bool) -> PolarsResult { - polars_ops::prelude::cum_max(s, reverse) +pub(super) fn cum_max(s: &Column, reverse: bool) -> PolarsResult { + // @scalar-opt + polars_ops::prelude::cum_max(s.as_materialized_series(), reverse).map(Column::from) } pub(super) mod dtypes { diff --git a/crates/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs index 1d1d6a5022e4..9dbde708e7bf 100644 --- a/crates/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -196,40 +196,56 @@ impl Display for TemporalFunction { } } -pub(super) fn millennium(s: &Series) -> PolarsResult { 
- s.millennium().map(|ca| ca.into_series()) -} -pub(super) fn century(s: &Series) -> PolarsResult { - s.century().map(|ca| ca.into_series()) -} -pub(super) fn year(s: &Series) -> PolarsResult { - s.year().map(|ca| ca.into_series()) -} -pub(super) fn is_leap_year(s: &Series) -> PolarsResult { - s.is_leap_year().map(|ca| ca.into_series()) -} -pub(super) fn iso_year(s: &Series) -> PolarsResult { - s.iso_year().map(|ca| ca.into_series()) -} -pub(super) fn month(s: &Series) -> PolarsResult { - s.month().map(|ca| ca.into_series()) -} -pub(super) fn quarter(s: &Series) -> PolarsResult { - s.quarter().map(|ca| ca.into_series()) -} -pub(super) fn week(s: &Series) -> PolarsResult { - s.week().map(|ca| ca.into_series()) -} -pub(super) fn weekday(s: &Series) -> PolarsResult { - s.weekday().map(|ca| ca.into_series()) -} -pub(super) fn day(s: &Series) -> PolarsResult { - s.day().map(|ca| ca.into_series()) -} -pub(super) fn ordinal_day(s: &Series) -> PolarsResult { - s.ordinal_day().map(|ca| ca.into_series()) -} -pub(super) fn time(s: &Series) -> PolarsResult { +pub(super) fn millennium(s: &Column) -> PolarsResult { + s.as_materialized_series() + .millennium() + .map(|ca| ca.into_column()) +} +pub(super) fn century(s: &Column) -> PolarsResult { + s.as_materialized_series() + .century() + .map(|ca| ca.into_column()) +} +pub(super) fn year(s: &Column) -> PolarsResult { + s.as_materialized_series().year().map(|ca| ca.into_column()) +} +pub(super) fn is_leap_year(s: &Column) -> PolarsResult { + s.as_materialized_series() + .is_leap_year() + .map(|ca| ca.into_column()) +} +pub(super) fn iso_year(s: &Column) -> PolarsResult { + s.as_materialized_series() + .iso_year() + .map(|ca| ca.into_column()) +} +pub(super) fn month(s: &Column) -> PolarsResult { + s.as_materialized_series() + .month() + .map(|ca| ca.into_column()) +} +pub(super) fn quarter(s: &Column) -> PolarsResult { + s.as_materialized_series() + .quarter() + .map(|ca| ca.into_column()) +} +pub(super) fn week(s: &Column) -> 
PolarsResult { + s.as_materialized_series().week().map(|ca| ca.into_column()) +} +pub(super) fn weekday(s: &Column) -> PolarsResult { + s.as_materialized_series() + .weekday() + .map(|ca| ca.into_column()) +} +pub(super) fn day(s: &Column) -> PolarsResult { + s.as_materialized_series().day().map(|ca| ca.into_column()) +} +pub(super) fn ordinal_day(s: &Column) -> PolarsResult { + s.as_materialized_series() + .ordinal_day() + .map(|ca| ca.into_column()) +} +pub(super) fn time(s: &Column) -> PolarsResult { match s.dtype() { #[cfg(feature = "timezones")] DataType::Datetime(_, Some(_)) => polars_ops::prelude::replace_time_zone( @@ -238,13 +254,19 @@ pub(super) fn time(s: &Series) -> PolarsResult { &StringChunked::from_iter(std::iter::once("raise")), NonExistent::Raise, )? - .cast(&DataType::Time), - DataType::Datetime(_, _) => s.datetime().unwrap().cast(&DataType::Time), + .cast(&DataType::Time) + .map(Column::from), + DataType::Datetime(_, _) => s + .datetime() + .unwrap() + .cast(&DataType::Time) + .map(Column::from), DataType::Time => Ok(s.clone()), dtype => polars_bail!(ComputeError: "expected Datetime or Time, got {}", dtype), } + .map(Column::from) } -pub(super) fn date(s: &Series) -> PolarsResult { +pub(super) fn date(s: &Column) -> PolarsResult { match s.dtype() { #[cfg(feature = "timezones")] DataType::Datetime(_, Some(tz)) => { @@ -261,14 +283,18 @@ pub(super) fn date(s: &Series) -> PolarsResult { // DST transitions may not preserve sortedness. 
out.set_sorted_flag(IsSorted::Not); } - Ok(out) + Ok(out.into()) }, - DataType::Datetime(_, _) => s.datetime().unwrap().cast(&DataType::Date), + DataType::Datetime(_, _) => s + .datetime() + .unwrap() + .cast(&DataType::Date) + .map(Column::from), DataType::Date => Ok(s.clone()), dtype => polars_bail!(ComputeError: "expected Datetime or Date, got {}", dtype), } } -pub(super) fn datetime(s: &Series) -> PolarsResult { +pub(super) fn datetime(s: &Column) -> PolarsResult { match s.dtype() { #[cfg(feature = "timezones")] DataType::Datetime(tu, Some(tz)) => { @@ -285,111 +311,139 @@ pub(super) fn datetime(s: &Series) -> PolarsResult { // DST transitions may not preserve sortedness. out.set_sorted_flag(IsSorted::Not); } - Ok(out) + Ok(out.into()) }, - DataType::Datetime(tu, _) => s.datetime().unwrap().cast(&DataType::Datetime(*tu, None)), + DataType::Datetime(tu, _) => s + .datetime() + .unwrap() + .cast(&DataType::Datetime(*tu, None)) + .map(Column::from), dtype => polars_bail!(ComputeError: "expected Datetime, got {}", dtype), } } -pub(super) fn hour(s: &Series) -> PolarsResult { - s.hour().map(|ca| ca.into_series()) +pub(super) fn hour(s: &Column) -> PolarsResult { + s.as_materialized_series().hour().map(|ca| ca.into_column()) } -pub(super) fn minute(s: &Series) -> PolarsResult { - s.minute().map(|ca| ca.into_series()) +pub(super) fn minute(s: &Column) -> PolarsResult { + s.as_materialized_series() + .minute() + .map(|ca| ca.into_column()) } -pub(super) fn second(s: &Series) -> PolarsResult { - s.second().map(|ca| ca.into_series()) +pub(super) fn second(s: &Column) -> PolarsResult { + s.as_materialized_series() + .second() + .map(|ca| ca.into_column()) } -pub(super) fn millisecond(s: &Series) -> PolarsResult { - s.nanosecond() - .map(|ca| (ca.wrapping_trunc_div_scalar(1_000_000)).into_series()) +pub(super) fn millisecond(s: &Column) -> PolarsResult { + s.as_materialized_series() + .nanosecond() + .map(|ca| (ca.wrapping_trunc_div_scalar(1_000_000)).into_column()) } 
-pub(super) fn microsecond(s: &Series) -> PolarsResult { - s.nanosecond() - .map(|ca| (ca.wrapping_trunc_div_scalar(1_000)).into_series()) +pub(super) fn microsecond(s: &Column) -> PolarsResult { + s.as_materialized_series() + .nanosecond() + .map(|ca| (ca.wrapping_trunc_div_scalar(1_000)).into_column()) } -pub(super) fn nanosecond(s: &Series) -> PolarsResult { - s.nanosecond().map(|ca| ca.into_series()) +pub(super) fn nanosecond(s: &Column) -> PolarsResult { + s.as_materialized_series() + .nanosecond() + .map(|ca| ca.into_column()) } #[cfg(feature = "dtype-duration")] -pub(super) fn total_days(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.days().into_series()) +pub(super) fn total_days(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.days().into_column()) } #[cfg(feature = "dtype-duration")] -pub(super) fn total_hours(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.hours().into_series()) +pub(super) fn total_hours(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.hours().into_column()) } #[cfg(feature = "dtype-duration")] -pub(super) fn total_minutes(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.minutes().into_series()) +pub(super) fn total_minutes(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.minutes().into_column()) } #[cfg(feature = "dtype-duration")] -pub(super) fn total_seconds(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.seconds().into_series()) +pub(super) fn total_seconds(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.seconds().into_column()) } #[cfg(feature = "dtype-duration")] -pub(super) fn total_milliseconds(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.milliseconds().into_series()) +pub(super) fn total_milliseconds(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.milliseconds().into_column()) } 
#[cfg(feature = "dtype-duration")] -pub(super) fn total_microseconds(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.microseconds().into_series()) +pub(super) fn total_microseconds(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.microseconds().into_column()) } #[cfg(feature = "dtype-duration")] -pub(super) fn total_nanoseconds(s: &Series) -> PolarsResult { - s.duration().map(|ca| ca.nanoseconds().into_series()) +pub(super) fn total_nanoseconds(s: &Column) -> PolarsResult { + s.as_materialized_series() + .duration() + .map(|ca| ca.nanoseconds().into_column()) } -pub(super) fn timestamp(s: &Series, tu: TimeUnit) -> PolarsResult { - s.timestamp(tu).map(|ca| ca.into_series()) +pub(super) fn timestamp(s: &Column, tu: TimeUnit) -> PolarsResult { + s.as_materialized_series() + .timestamp(tu) + .map(|ca| ca.into_column()) } -pub(super) fn to_string(s: &Series, format: &str) -> PolarsResult { - TemporalMethods::to_string(s, format) +pub(super) fn to_string(s: &Column, format: &str) -> PolarsResult { + TemporalMethods::to_string(s.as_materialized_series(), format).map(Column::from) } #[cfg(feature = "timezones")] -pub(super) fn convert_time_zone(s: &Series, time_zone: &TimeZone) -> PolarsResult { +pub(super) fn convert_time_zone(s: &Column, time_zone: &TimeZone) -> PolarsResult { match s.dtype() { DataType::Datetime(_, _) => { let mut ca = s.datetime()?.clone(); validate_time_zone(time_zone)?; ca.set_time_zone(time_zone.clone())?; - Ok(ca.into_series()) + Ok(ca.into_column()) }, dtype => polars_bail!(ComputeError: "expected Datetime, got {}", dtype), } } -pub(super) fn with_time_unit(s: &Series, tu: TimeUnit) -> PolarsResult { +pub(super) fn with_time_unit(s: &Column, tu: TimeUnit) -> PolarsResult { match s.dtype() { DataType::Datetime(_, _) => { let mut ca = s.datetime()?.clone(); ca.set_time_unit(tu); - Ok(ca.into_series()) + Ok(ca.into_column()) }, #[cfg(feature = "dtype-duration")] DataType::Duration(_) => { - let mut ca = 
s.duration()?.clone(); + let mut ca = s.as_materialized_series().duration()?.clone(); ca.set_time_unit(tu); - Ok(ca.into_series()) + Ok(ca.into_column()) }, dt => polars_bail!(ComputeError: "dtype `{}` has no time unit", dt), } } -pub(super) fn cast_time_unit(s: &Series, tu: TimeUnit) -> PolarsResult { +pub(super) fn cast_time_unit(s: &Column, tu: TimeUnit) -> PolarsResult { match s.dtype() { DataType::Datetime(_, _) => { let ca = s.datetime()?; - Ok(ca.cast_time_unit(tu).into_series()) + Ok(ca.cast_time_unit(tu).into_column()) }, #[cfg(feature = "dtype-duration")] DataType::Duration(_) => { - let ca = s.duration()?; - Ok(ca.cast_time_unit(tu).into_series()) + let ca = s.as_materialized_series().duration()?; + Ok(ca.cast_time_unit(tu).into_column()) }, dt => polars_bail!(ComputeError: "dtype `{}` has no time unit", dt), } } -pub(super) fn truncate(s: &[Series]) -> PolarsResult { +pub(super) fn truncate(s: &[Column]) -> PolarsResult { let time_series = &s[0]; let every = s[1].str()?; @@ -399,10 +453,10 @@ pub(super) fn truncate(s: &[Series]) -> PolarsResult { Some(tz) => time_series .datetime()? .truncate(tz.parse::().ok().as_ref(), every)? 
- .into_series(), - _ => time_series.datetime()?.truncate(None, every)?.into_series(), + .into_column(), + _ => time_series.datetime()?.truncate(None, every)?.into_column(), }, - DataType::Date => time_series.date()?.truncate(None, every)?.into_series(), + DataType::Date => time_series.date()?.truncate(None, every)?.into_column(), dt => polars_bail!(opq = round, got = dt, expected = "date/datetime"), }; out.set_sorted_flag(time_series.is_sorted_flag()); @@ -410,12 +464,15 @@ pub(super) fn truncate(s: &[Series]) -> PolarsResult { } #[cfg(feature = "offset_by")] -pub(super) fn offset_by(s: &[Series]) -> PolarsResult { - impl_offset_by(&s[0], &s[1]) +pub(super) fn offset_by(s: &[Column]) -> PolarsResult { + impl_offset_by( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + ).map(Column::from) } #[cfg(feature = "month_start")] -pub(super) fn month_start(s: &Series) -> PolarsResult { +pub(super) fn month_start(s: &Column) -> PolarsResult { Ok(match s.dtype() { DataType::Datetime(_, tz) => match tz { #[cfg(feature = "timezones")] @@ -423,16 +480,16 @@ pub(super) fn month_start(s: &Series) -> PolarsResult { .datetime() .unwrap() .month_start(tz.parse::().ok().as_ref())? - .into_series(), - _ => s.datetime().unwrap().month_start(None)?.into_series(), + .into_column(), + _ => s.datetime().unwrap().month_start(None)?.into_column(), }, - DataType::Date => s.date().unwrap().month_start(None)?.into_series(), + DataType::Date => s.date().unwrap().month_start(None)?.into_column(), dt => polars_bail!(opq = month_start, got = dt, expected = "date/datetime"), }) } #[cfg(feature = "month_end")] -pub(super) fn month_end(s: &Series) -> PolarsResult { +pub(super) fn month_end(s: &Column) -> PolarsResult { Ok(match s.dtype() { DataType::Datetime(_, tz) => match tz { #[cfg(feature = "timezones")] @@ -440,22 +497,22 @@ pub(super) fn month_end(s: &Series) -> PolarsResult { .datetime() .unwrap() .month_end(tz.parse::().ok().as_ref())? 
- .into_series(), - _ => s.datetime().unwrap().month_end(None)?.into_series(), + .into_column(), + _ => s.datetime().unwrap().month_end(None)?.into_column(), }, - DataType::Date => s.date().unwrap().month_end(None)?.into_series(), + DataType::Date => s.date().unwrap().month_end(None)?.into_column(), dt => polars_bail!(opq = month_end, got = dt, expected = "date/datetime"), }) } #[cfg(feature = "timezones")] -pub(super) fn base_utc_offset(s: &Series) -> PolarsResult { +pub(super) fn base_utc_offset(s: &Column) -> PolarsResult { match s.dtype() { DataType::Datetime(time_unit, Some(tz)) => { let tz = tz .parse::() .expect("Time zone has already been validated"); - Ok(base_utc_offset_fn(s.datetime().unwrap(), time_unit, &tz).into_series()) + Ok(base_utc_offset_fn(s.datetime().unwrap(), time_unit, &tz).into_column()) }, dt => polars_bail!( opq = base_utc_offset, @@ -465,13 +522,13 @@ pub(super) fn base_utc_offset(s: &Series) -> PolarsResult { } } #[cfg(feature = "timezones")] -pub(super) fn dst_offset(s: &Series) -> PolarsResult { +pub(super) fn dst_offset(s: &Column) -> PolarsResult { match s.dtype() { DataType::Datetime(time_unit, Some(tz)) => { let tz = tz .parse::() .expect("Time zone has already been validated"); - Ok(dst_offset_fn(s.datetime().unwrap(), time_unit, &tz).into_series()) + Ok(dst_offset_fn(s.datetime().unwrap(), time_unit, &tz).into_column()) }, dt => polars_bail!( opq = dst_offset, @@ -481,7 +538,7 @@ pub(super) fn dst_offset(s: &Series) -> PolarsResult { } } -pub(super) fn round(s: &[Series]) -> PolarsResult { +pub(super) fn round(s: &[Column]) -> PolarsResult { let time_series = &s[0]; let every = s[1].str()?; @@ -492,18 +549,18 @@ pub(super) fn round(s: &[Series]) -> PolarsResult { .datetime() .unwrap() .round(every, tz.parse::().ok().as_ref())? - .into_series(), + .into_column(), _ => time_series .datetime() .unwrap() .round(every, None)? - .into_series(), + .into_column(), }, DataType::Date => time_series .date() .unwrap() .round(every, None)? 
- .into_series(), + .into_column(), dt => polars_bail!(opq = round, got = dt, expected = "date/datetime"), }) } diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index 12275fc57200..06d44a74f382 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -1,41 +1,45 @@ use super::*; -pub(super) fn reverse(s: &Series) -> PolarsResult { +pub(super) fn reverse(s: &Column) -> PolarsResult { Ok(s.reverse()) } #[cfg(feature = "approx_unique")] -pub(super) fn approx_n_unique(s: &Series) -> PolarsResult { - polars_ops::prelude::approx_n_unique(s) +pub(super) fn approx_n_unique(s: &Column) -> PolarsResult { + polars_ops::prelude::approx_n_unique(s.as_materialized_series()).map(Column::from) } #[cfg(feature = "diff")] -pub(super) fn diff(s: &Series, n: i64, null_behavior: NullBehavior) -> PolarsResult { - polars_ops::prelude::diff(s, n, null_behavior) +pub(super) fn diff(s: &Column, n: i64, null_behavior: NullBehavior) -> PolarsResult { + polars_ops::prelude::diff(s.as_materialized_series(), n, null_behavior).map(Column::from) } #[cfg(feature = "pct_change")] -pub(super) fn pct_change(s: &[Series]) -> PolarsResult { - polars_ops::prelude::pct_change(&s[0], &s[1]) +pub(super) fn pct_change(s: &[Column]) -> PolarsResult { + polars_ops::prelude::pct_change( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + ) + .map(Column::from) } #[cfg(feature = "interpolate")] -pub(super) fn interpolate(s: &Series, method: InterpolationMethod) -> PolarsResult { - Ok(polars_ops::prelude::interpolate(s, method)) +pub(super) fn interpolate(s: &Column, method: InterpolationMethod) -> PolarsResult { + Ok(polars_ops::prelude::interpolate(s.as_materialized_series(), method).into()) } #[cfg(feature = "interpolate_by")] -pub(super) fn interpolate_by(s: &[Series]) -> PolarsResult { +pub(super) fn interpolate_by(s: &[Column]) -> PolarsResult { let 
by = &s[1]; - let by_is_sorted = by.is_sorted(Default::default())?; + let by_is_sorted = by.as_materialized_series().is_sorted(Default::default())?; polars_ops::prelude::interpolate_by(&s[0], by, by_is_sorted) } -pub(super) fn to_physical(s: &Series) -> PolarsResult { - Ok(s.to_physical_repr().into_owned()) +pub(super) fn to_physical(s: &Column) -> PolarsResult { + Ok(s.to_physical_repr()) } -pub(super) fn set_sorted_flag(s: &Series, sorted: IsSorted) -> PolarsResult { +pub(super) fn set_sorted_flag(s: &Column, sorted: IsSorted) -> PolarsResult { let mut s = s.clone(); s.set_sorted_flag(sorted); Ok(s) @@ -43,34 +47,35 @@ pub(super) fn set_sorted_flag(s: &Series, sorted: IsSorted) -> PolarsResult, non_existent: NonExistent, -) -> PolarsResult { +) -> PolarsResult { let s1 = &s[0]; let ca = s1.datetime().unwrap(); let s2 = &s[1].str()?; - Ok(polars_ops::prelude::replace_time_zone(ca, time_zone, s2, non_existent)?.into_series()) + Ok(polars_ops::prelude::replace_time_zone(ca, time_zone, s2, non_existent)?.into_column()) } #[cfg(feature = "dtype-struct")] pub(super) fn value_counts( - s: &Series, + s: &Column, sort: bool, parallel: bool, name: PlSmallStr, normalize: bool, -) -> PolarsResult { - s.value_counts(sort, parallel, name, normalize) - .map(|df| df.into_struct(s.name().clone()).into_series()) +) -> PolarsResult { + s.as_materialized_series() + .value_counts(sort, parallel, name, normalize) + .map(|df| df.into_struct(s.name().clone()).into_column()) } #[cfg(feature = "unique_counts")] -pub(super) fn unique_counts(s: &Series) -> PolarsResult { - polars_ops::prelude::unique_counts(s) +pub(super) fn unique_counts(s: &Column) -> PolarsResult { + polars_ops::prelude::unique_counts(s.as_materialized_series()).map(Column::from) } -pub(super) fn reshape(s: &Series, dimensions: &[i64], nested: &NestedType) -> PolarsResult { +pub(super) fn reshape(s: &Column, dimensions: &[i64], nested: &NestedType) -> PolarsResult { match nested { NestedType::List => 
s.reshape_list(dimensions), #[cfg(feature = "dtype-array")] @@ -79,120 +84,150 @@ pub(super) fn reshape(s: &Series, dimensions: &[i64], nested: &NestedType) -> Po } #[cfg(feature = "repeat_by")] -pub(super) fn repeat_by(s: &[Series]) -> PolarsResult { +pub(super) fn repeat_by(s: &[Column]) -> PolarsResult { let by = &s[1]; let s = &s[0]; let by = by.cast(&IDX_DTYPE)?; - polars_ops::chunked_array::repeat_by(s, by.idx()?).map(|ok| ok.into_series()) + polars_ops::chunked_array::repeat_by(s.as_materialized_series(), by.idx()?) + .map(|ok| ok.into_column()) } -pub(super) fn backward_fill(s: &Series, limit: FillNullLimit) -> PolarsResult { +pub(super) fn backward_fill(s: &Column, limit: FillNullLimit) -> PolarsResult { s.fill_null(FillNullStrategy::Backward(limit)) } -pub(super) fn forward_fill(s: &Series, limit: FillNullLimit) -> PolarsResult { +pub(super) fn forward_fill(s: &Column, limit: FillNullLimit) -> PolarsResult { s.fill_null(FillNullStrategy::Forward(limit)) } -pub(super) fn max_horizontal(s: &mut [Series]) -> PolarsResult> { +pub(super) fn max_horizontal(s: &mut [Column]) -> PolarsResult> { polars_ops::prelude::max_horizontal(s) } -pub(super) fn min_horizontal(s: &mut [Series]) -> PolarsResult> { +pub(super) fn min_horizontal(s: &mut [Column]) -> PolarsResult> { polars_ops::prelude::min_horizontal(s) } -pub(super) fn sum_horizontal(s: &mut [Series]) -> PolarsResult> { +pub(super) fn sum_horizontal(s: &mut [Column]) -> PolarsResult> { polars_ops::prelude::sum_horizontal(s) } -pub(super) fn mean_horizontal(s: &mut [Series]) -> PolarsResult> { +pub(super) fn mean_horizontal(s: &mut [Column]) -> PolarsResult> { polars_ops::prelude::mean_horizontal(s) } -pub(super) fn drop_nulls(s: &Series) -> PolarsResult { +pub(super) fn drop_nulls(s: &Column) -> PolarsResult { Ok(s.drop_nulls()) } #[cfg(feature = "mode")] -pub(super) fn mode(s: &Series) -> PolarsResult { +pub(super) fn mode(s: &Column) -> PolarsResult { mode::mode(s) } #[cfg(feature = "moment")] -pub(super) fn 
skew(s: &Series, bias: bool) -> PolarsResult { - s.skew(bias) - .map(|opt_v| Series::new(s.name().clone(), &[opt_v])) +pub(super) fn skew(s: &Column, bias: bool) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .skew(bias) + .map(|opt_v| Column::new(s.name().clone(), &[opt_v])) } #[cfg(feature = "moment")] -pub(super) fn kurtosis(s: &Series, fisher: bool, bias: bool) -> PolarsResult { - s.kurtosis(fisher, bias) - .map(|opt_v| Series::new(s.name().clone(), &[opt_v])) +pub(super) fn kurtosis(s: &Column, fisher: bool, bias: bool) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .kurtosis(fisher, bias) + .map(|opt_v| Column::new(s.name().clone(), &[opt_v])) } -pub(super) fn arg_unique(s: &Series) -> PolarsResult { - s.arg_unique().map(|ok| ok.into_series()) +pub(super) fn arg_unique(s: &Column) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .arg_unique() + .map(|ok| ok.into_column()) } #[cfg(feature = "rank")] -pub(super) fn rank(s: &Series, options: RankOptions, seed: Option) -> PolarsResult { - Ok(s.rank(options, seed)) +pub(super) fn rank(s: &Column, options: RankOptions, seed: Option) -> PolarsResult { + Ok(s.as_materialized_series().rank(options, seed).into_column()) } #[cfg(feature = "hist")] pub(super) fn hist( - s: &[Series], + s: &[Column], bin_count: Option, include_category: bool, include_breakpoint: bool, -) -> PolarsResult { +) -> PolarsResult { let bins = if s.len() == 2 { - Some(s[1].clone()) + Some(&s[1]) } else { None }; - let s = &s[0]; - hist_series(s, bin_count, bins, include_category, include_breakpoint) + let s = s[0].as_materialized_series(); + hist_series( + s, + bin_count, + bins.map(|b| b.as_materialized_series().clone()), + include_category, + include_breakpoint, + ) + .map(Column::from) } #[cfg(feature = "replace")] -pub(super) fn replace(s: &[Series]) -> PolarsResult { - polars_ops::series::replace(&s[0], &s[1], &s[2]) +pub(super) fn replace(s: &[Column]) -> PolarsResult { + 
polars_ops::series::replace( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + &s[2].as_materialized_series(), + ) + .map(Column::from) } #[cfg(feature = "replace")] -pub(super) fn replace_strict(s: &[Series], return_dtype: Option) -> PolarsResult { +pub(super) fn replace_strict(s: &[Column], return_dtype: Option) -> PolarsResult { match s.get(3) { - Some(default) => { - polars_ops::series::replace_or_default(&s[0], &s[1], &s[2], default, return_dtype) - }, - None => polars_ops::series::replace_strict(&s[0], &s[1], &s[2], return_dtype), + Some(default) => polars_ops::series::replace_or_default( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + &s[2].as_materialized_series(), + default.as_materialized_series(), + return_dtype, + ), + None => polars_ops::series::replace_strict( + &s[0].as_materialized_series(), + &s[1].as_materialized_series(), + &s[2].as_materialized_series(), + return_dtype, + ), } + .map(Column::from) } pub(super) fn fill_null_with_strategy( - s: &Series, + s: &Column, strategy: FillNullStrategy, -) -> PolarsResult { +) -> PolarsResult { s.fill_null(strategy) } -pub(super) fn gather_every(s: &Series, n: usize, offset: usize) -> PolarsResult { +pub(super) fn gather_every(s: &Column, n: usize, offset: usize) -> PolarsResult { polars_ensure!(n > 0, InvalidOperation: "gather_every(n): n should be positive"); Ok(s.gather_every(n, offset)) } #[cfg(feature = "reinterpret")] -pub(super) fn reinterpret(s: &Series, signed: bool) -> PolarsResult { - polars_ops::series::reinterpret(s, signed) +pub(super) fn reinterpret(s: &Column, signed: bool) -> PolarsResult { + polars_ops::series::reinterpret(s.as_materialized_series(), signed).map(Column::from) } -pub(super) fn negate(s: &Series) -> PolarsResult { - polars_ops::series::negate(s) +pub(super) fn negate(s: &Column) -> PolarsResult { + polars_ops::series::negate(s.as_materialized_series()).map(Column::from) } -pub(super) fn extend_constant(s: &[Series]) -> PolarsResult 
{ +pub(super) fn extend_constant(s: &[Column]) -> PolarsResult { let value = &s[1]; let n = &s[2]; polars_ensure!(value.len() == 1 && n.len() == 1, ComputeError: "value and n should have unit length."); diff --git a/crates/polars-plan/src/dsl/function_expr/ewm.rs b/crates/polars-plan/src/dsl/function_expr/ewm.rs index b824ca3013e9..6f7a20045503 100644 --- a/crates/polars-plan/src/dsl/function_expr/ewm.rs +++ b/crates/polars-plan/src/dsl/function_expr/ewm.rs @@ -1,13 +1,13 @@ use super::*; -pub(super) fn ewm_mean(s: &Series, options: EWMOptions) -> PolarsResult { - polars_ops::prelude::ewm_mean(s, options) +pub(super) fn ewm_mean(s: &Column, options: EWMOptions) -> PolarsResult { + polars_ops::prelude::ewm_mean(s.as_materialized_series(), options).map(Column::from) } -pub(super) fn ewm_std(s: &Series, options: EWMOptions) -> PolarsResult { - polars_ops::prelude::ewm_std(s, options) +pub(super) fn ewm_std(s: &Column, options: EWMOptions) -> PolarsResult { + polars_ops::prelude::ewm_std(s.as_materialized_series(), options).map(Column::from) } -pub(super) fn ewm_var(s: &Series, options: EWMOptions) -> PolarsResult { - polars_ops::prelude::ewm_var(s, options) +pub(super) fn ewm_var(s: &Column, options: EWMOptions) -> PolarsResult { + polars_ops::prelude::ewm_var(s.as_materialized_series(), options).map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/ewm_by.rs b/crates/polars-plan/src/dsl/function_expr/ewm_by.rs index c901dc22a25f..adfc66a01524 100644 --- a/crates/polars-plan/src/dsl/function_expr/ewm_by.rs +++ b/crates/polars-plan/src/dsl/function_expr/ewm_by.rs @@ -2,7 +2,7 @@ use polars_ops::series::SeriesMethods; use super::*; -pub(super) fn ewm_mean_by(s: &[Series], half_life: Duration) -> PolarsResult { +pub(super) fn ewm_mean_by(s: &[Column], half_life: Duration) -> PolarsResult { let time_zone = match s[1].dtype() { DataType::Datetime(_, Some(time_zone)) => Some(time_zone.as_str()), _ => None, @@ -13,6 +13,14 @@ pub(super) fn ewm_mean_by(s: 
&[Series], half_life: Duration) -> PolarsResult PolarsResult { +pub(super) fn fill_null(s: &[Column]) -> PolarsResult { let series = s[0].clone(); - let fill_value = s[1].clone(); // Nothing to fill, so return early // this is done after casting as the output type must be correct @@ -10,8 +9,10 @@ pub(super) fn fill_null(s: &[Series]) -> PolarsResult { return Ok(series); } + let fill_value = s[1].clone(); + // default branch - fn default(series: Series, fill_value: Series) -> PolarsResult { + fn default(series: Column, fill_value: Column) -> PolarsResult { let mask = series.is_not_null(); series.zip_with_same_type(&mask, &fill_value) } @@ -28,7 +29,7 @@ pub(super) fn fill_null(s: &[Series]) -> PolarsResult { let cats = series.to_physical_repr(); let mask = cats.is_not_null(); let out = cats - .zip_with_same_type(&mask, &Series::new(PlSmallStr::EMPTY, &[idx])) + .zip_with_same_type(&mask, &Column::new(PlSmallStr::EMPTY, &[idx])) .unwrap(); unsafe { return out.cast_unchecked(series.dtype()) } } @@ -46,6 +47,6 @@ pub(super) fn fill_null(s: &[Series]) -> PolarsResult { } } -pub(super) fn coalesce(s: &mut [Series]) -> PolarsResult { +pub(super) fn coalesce(s: &mut [Column]) -> PolarsResult { coalesce_columns(s) } diff --git a/crates/polars-plan/src/dsl/function_expr/fused.rs b/crates/polars-plan/src/dsl/function_expr/fused.rs index a95ac809ebc7..088078105216 100644 --- a/crates/polars-plan/src/dsl/function_expr/fused.rs +++ b/crates/polars-plan/src/dsl/function_expr/fused.rs @@ -22,13 +22,13 @@ impl Display for FusedOperator { } } -pub(super) fn fused(input: &[Series], op: FusedOperator) -> PolarsResult { +pub(super) fn fused(input: &[Column], op: FusedOperator) -> PolarsResult { let s0 = &input[0]; let s1 = &input[1]; let s2 = &input[2]; match op { - FusedOperator::MultiplyAdd => Ok(fma_series(s0, s1, s2)), - FusedOperator::SubMultiply => Ok(fsm_series(s0, s1, s2)), - FusedOperator::MultiplySub => Ok(fms_series(s0, s1, s2)), + FusedOperator::MultiplyAdd => 
Ok(fma_columns(s0, s1, s2)), + FusedOperator::SubMultiply => Ok(fsm_columns(s0, s1, s2)), + FusedOperator::MultiplySub => Ok(fms_columns(s0, s1, s2)), } } diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index 05df577ed8f3..c2badaecea8d 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -179,7 +179,7 @@ impl Display for ListFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: ListFunction) -> Self { use ListFunction::*; match func { @@ -240,49 +240,55 @@ impl From for SpecialEq> { } #[cfg(feature = "is_in")] -pub(super) fn contains(args: &mut [Series]) -> PolarsResult> { +pub(super) fn contains(args: &mut [Column]) -> PolarsResult> { let list = &args[0]; let item = &args[1]; polars_ensure!(matches!(list.dtype(), DataType::List(_)), SchemaMismatch: "invalid series dtype: expected `List`, got `{}`", list.dtype(), ); - polars_ops::prelude::is_in(item, list).map(|mut ca| { - ca.rename(list.name().clone()); - Some(ca.into_series()) - }) + polars_ops::prelude::is_in(item.as_materialized_series(), list.as_materialized_series()).map( + |mut ca| { + ca.rename(list.name().clone()); + Some(ca.into_column()) + }, + ) } #[cfg(feature = "list_drop_nulls")] -pub(super) fn drop_nulls(s: &Series) -> PolarsResult { +pub(super) fn drop_nulls(s: &Column) -> PolarsResult { let list = s.list()?; - - Ok(list.lst_drop_nulls().into_series()) + Ok(list.lst_drop_nulls().into_column()) } #[cfg(feature = "list_sample")] pub(super) fn sample_n( - s: &[Series], + s: &[Column], with_replacement: bool, shuffle: bool, seed: Option, -) -> PolarsResult { +) -> PolarsResult { let list = s[0].list()?; let n = &s[1]; - list.lst_sample_n(n, with_replacement, shuffle, seed) - .map(|ok| ok.into_series()) + list.lst_sample_n(n.as_materialized_series(), with_replacement, shuffle, seed) + .map(|ok| ok.into_column()) } #[cfg(feature = 
"list_sample")] pub(super) fn sample_fraction( - s: &[Series], + s: &[Column], with_replacement: bool, shuffle: bool, seed: Option, -) -> PolarsResult { +) -> PolarsResult { let list = s[0].list()?; let fraction = &s[1]; - list.lst_sample_fraction(fraction, with_replacement, shuffle, seed) - .map(|ok| ok.into_series()) + list.lst_sample_fraction( + fraction.as_materialized_series(), + with_replacement, + shuffle, + seed, + ) + .map(|ok| ok.into_column()) } fn check_slice_arg_shape(slice_len: usize, ca_len: usize, name: &str) -> PolarsResult<()> { @@ -295,14 +301,14 @@ fn check_slice_arg_shape(slice_len: usize, ca_len: usize, name: &str) -> PolarsR Ok(()) } -pub(super) fn shift(s: &[Series]) -> PolarsResult { +pub(super) fn shift(s: &[Column]) -> PolarsResult { let list = s[0].list()?; let periods = &s[1]; - list.lst_shift(periods).map(|ok| ok.into_series()) + list.lst_shift(periods).map(|ok| ok.into_column()) } -pub(super) fn slice(args: &mut [Series]) -> PolarsResult> { +pub(super) fn slice(args: &mut [Column]) -> PolarsResult> { let s = &args[0]; let list_ca = s.list()?; let offset_s = &args[1]; @@ -316,7 +322,7 @@ pub(super) fn slice(args: &mut [Series]) -> PolarsResult> { .unwrap() .extract::() .unwrap_or(usize::MAX); - return Ok(Some(list_ca.lst_slice(offset, slice_len).into_series())); + return Ok(Some(list_ca.lst_slice(offset, slice_len).into_column())); }, (1, length_slice_len) => { check_slice_arg_shape(length_slice_len, list_ca.len(), "length")?; @@ -379,10 +385,10 @@ pub(super) fn slice(args: &mut [Series]) -> PolarsResult> { }, }; out.rename(s.name().clone()); - Ok(Some(out.into_series())) + Ok(Some(out.into_column())) } -pub(super) fn concat(s: &mut [Series]) -> PolarsResult> { +pub(super) fn concat(s: &mut [Column]) -> PolarsResult> { let mut first = std::mem::take(&mut s[0]); let other = &s[1..]; @@ -402,10 +408,10 @@ pub(super) fn concat(s: &mut [Series]) -> PolarsResult> { } } - first_ca.lst_concat(other).map(|ca| Some(ca.into_series())) + 
first_ca.lst_concat(other).map(|ca| Some(ca.into_column())) } -pub(super) fn get(s: &mut [Series], null_on_oob: bool) -> PolarsResult> { +pub(super) fn get(s: &mut [Column], null_on_oob: bool) -> PolarsResult> { let ca = s[0].list()?; let index = s[1].cast(&DataType::Int64)?; let index = index.i64().unwrap(); @@ -414,9 +420,9 @@ pub(super) fn get(s: &mut [Series], null_on_oob: bool) -> PolarsResult { let index = index.get(0); if let Some(index) = index { - ca.lst_get(index, null_on_oob).map(Some) + ca.lst_get(index, null_on_oob).map(Column::from).map(Some) } else { - Ok(Some(Series::full_null( + Ok(Some(Column::full_null( ca.name().clone(), ca.len(), ca.inner_dtype(), @@ -478,6 +484,7 @@ pub(super) fn get(s: &mut [Series], null_on_oob: bool) -> PolarsResult polars_bail!( @@ -489,7 +496,7 @@ pub(super) fn get(s: &mut [Series], null_on_oob: bool) -> PolarsResult PolarsResult { +pub(super) fn gather(args: &[Column], null_on_oob: bool) -> PolarsResult { let ca = &args[0]; let idx = &args[1]; let ca = ca.list()?; @@ -497,25 +504,25 @@ pub(super) fn gather(args: &[Series], null_on_oob: bool) -> PolarsResult if idx.len() == 1 && null_on_oob { // fast path let idx = idx.get(0)?.try_extract::()?; - let out = ca.lst_get(idx, null_on_oob)?; + let out = ca.lst_get(idx, null_on_oob).map(Column::from)?; // make sure we return a list out.reshape_list(&[-1, 1]) } else { - ca.lst_gather(idx, null_on_oob) + ca.lst_gather(idx.as_materialized_series(), null_on_oob).map(Column::from) } } #[cfg(feature = "list_gather")] -pub(super) fn gather_every(args: &[Series]) -> PolarsResult { +pub(super) fn gather_every(args: &[Column]) -> PolarsResult { let ca = &args[0]; let n = &args[1].strict_cast(&IDX_DTYPE)?; let offset = &args[2].strict_cast(&IDX_DTYPE)?; - ca.list()?.lst_gather_every(n.idx()?, offset.idx()?) 
+ ca.list()?.lst_gather_every(n.idx()?, offset.idx()?).map(Column::from) } #[cfg(feature = "list_count")] -pub(super) fn count_matches(args: &[Series]) -> PolarsResult { +pub(super) fn count_matches(args: &[Column]) -> PolarsResult { let s = &args[0]; let element = &args[1]; polars_ensure!( @@ -524,72 +531,72 @@ pub(super) fn count_matches(args: &[Series]) -> PolarsResult { element.len() ); let ca = s.list()?; - list_count_matches(ca, element.get(0).unwrap()) + list_count_matches(ca, element.get(0).unwrap()).map(Column::from) } -pub(super) fn sum(s: &Series) -> PolarsResult { - s.list()?.lst_sum() +pub(super) fn sum(s: &Column) -> PolarsResult { + s.list()?.lst_sum().map(Column::from) } -pub(super) fn length(s: &Series) -> PolarsResult { - Ok(s.list()?.lst_lengths().into_series()) +pub(super) fn length(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_lengths().into_column()) } -pub(super) fn max(s: &Series) -> PolarsResult { - s.list()?.lst_max() +pub(super) fn max(s: &Column) -> PolarsResult { + s.list()?.lst_max().map(Column::from) } -pub(super) fn min(s: &Series) -> PolarsResult { - s.list()?.lst_min() +pub(super) fn min(s: &Column) -> PolarsResult { + s.list()?.lst_min().map(Column::from) } -pub(super) fn mean(s: &Series) -> PolarsResult { - Ok(s.list()?.lst_mean()) +pub(super) fn mean(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_mean().into()) } -pub(super) fn median(s: &Series) -> PolarsResult { - Ok(s.list()?.lst_median()) +pub(super) fn median(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_median().into()) } -pub(super) fn std(s: &Series, ddof: u8) -> PolarsResult { - Ok(s.list()?.lst_std(ddof)) +pub(super) fn std(s: &Column, ddof: u8) -> PolarsResult { + Ok(s.list()?.lst_std(ddof).into()) } -pub(super) fn var(s: &Series, ddof: u8) -> PolarsResult { - Ok(s.list()?.lst_var(ddof)) +pub(super) fn var(s: &Column, ddof: u8) -> PolarsResult { + Ok(s.list()?.lst_var(ddof).into()) } -pub(super) fn arg_min(s: &Series) -> PolarsResult { - 
Ok(s.list()?.lst_arg_min().into_series()) +pub(super) fn arg_min(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_arg_min().into_column()) } -pub(super) fn arg_max(s: &Series) -> PolarsResult { - Ok(s.list()?.lst_arg_max().into_series()) +pub(super) fn arg_max(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_arg_max().into_column()) } #[cfg(feature = "diff")] -pub(super) fn diff(s: &Series, n: i64, null_behavior: NullBehavior) -> PolarsResult { - Ok(s.list()?.lst_diff(n, null_behavior)?.into_series()) +pub(super) fn diff(s: &Column, n: i64, null_behavior: NullBehavior) -> PolarsResult { + Ok(s.list()?.lst_diff(n, null_behavior)?.into_column()) } -pub(super) fn sort(s: &Series, options: SortOptions) -> PolarsResult { - Ok(s.list()?.lst_sort(options)?.into_series()) +pub(super) fn sort(s: &Column, options: SortOptions) -> PolarsResult { + Ok(s.list()?.lst_sort(options)?.into_column()) } -pub(super) fn reverse(s: &Series) -> PolarsResult { - Ok(s.list()?.lst_reverse().into_series()) +pub(super) fn reverse(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_reverse().into_column()) } -pub(super) fn unique(s: &Series, is_stable: bool) -> PolarsResult { +pub(super) fn unique(s: &Column, is_stable: bool) -> PolarsResult { if is_stable { - Ok(s.list()?.lst_unique_stable()?.into_series()) + Ok(s.list()?.lst_unique_stable()?.into_column()) } else { - Ok(s.list()?.lst_unique()?.into_series()) + Ok(s.list()?.lst_unique()?.into_column()) } } #[cfg(feature = "list_sets")] -pub(super) fn set_operation(s: &[Series], set_type: SetOperation) -> PolarsResult { +pub(super) fn set_operation(s: &[Column], set_type: SetOperation) -> PolarsResult { let s0 = &s[0]; let s1 = &s[1]; @@ -613,31 +620,31 @@ pub(super) fn set_operation(s: &[Series], set_type: SetOperation) -> PolarsResul }; } - list_set_operation(s0.list()?, s1.list()?, set_type).map(|ca| ca.into_series()) + list_set_operation(s0.list()?, s1.list()?, set_type).map(|ca| ca.into_column()) } #[cfg(feature = "list_any_all")] 
-pub(super) fn lst_any(s: &Series) -> PolarsResult { - s.list()?.lst_any() +pub(super) fn lst_any(s: &Column) -> PolarsResult { + s.list()?.lst_any().map(Column::from) } #[cfg(feature = "list_any_all")] -pub(super) fn lst_all(s: &Series) -> PolarsResult { - s.list()?.lst_all() +pub(super) fn lst_all(s: &Column) -> PolarsResult { + s.list()?.lst_all().map(Column::from) } -pub(super) fn join(s: &[Series], ignore_nulls: bool) -> PolarsResult { +pub(super) fn join(s: &[Column], ignore_nulls: bool) -> PolarsResult { let ca = s[0].list()?; let separator = s[1].str()?; - Ok(ca.lst_join(separator, ignore_nulls)?.into_series()) + Ok(ca.lst_join(separator, ignore_nulls)?.into_column()) } #[cfg(feature = "dtype-array")] -pub(super) fn to_array(s: &Series, width: usize) -> PolarsResult { +pub(super) fn to_array(s: &Column, width: usize) -> PolarsResult { let array_dtype = map_list_dtype_to_array_dtype(s.dtype(), width)?; s.cast(&array_dtype) } -pub(super) fn n_unique(s: &Series) -> PolarsResult { - Ok(s.list()?.lst_n_unique()?.into_series()) +pub(super) fn n_unique(s: &Column) -> PolarsResult { + Ok(s.list()?.lst_n_unique()?.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/log.rs b/crates/polars-plan/src/dsl/function_expr/log.rs index 42c71c681f33..23b6b1e970b1 100644 --- a/crates/polars-plan/src/dsl/function_expr/log.rs +++ b/crates/polars-plan/src/dsl/function_expr/log.rs @@ -1,23 +1,23 @@ use super::*; -pub(super) fn entropy(s: &Series, base: f64, normalize: bool) -> PolarsResult { - let out = s.entropy(base, normalize)?; +pub(super) fn entropy(s: &Column, base: f64, normalize: bool) -> PolarsResult { + let out = s.as_materialized_series().entropy(base, normalize)?; if matches!(s.dtype(), DataType::Float32) { let out = out as f32; - Ok(Series::new(s.name().clone(), [out])) + Ok(Column::new(s.name().clone(), [out])) } else { - Ok(Series::new(s.name().clone(), [out])) + Ok(Column::new(s.name().clone(), [out])) } } -pub(super) fn log(s: &Series, base: 
f64) -> PolarsResult { - Ok(s.log(base)) +pub(super) fn log(s: &Column, base: f64) -> PolarsResult { + Ok(s.as_materialized_series().log(base).into()) } -pub(super) fn log1p(s: &Series) -> PolarsResult { - Ok(s.log1p()) +pub(super) fn log1p(s: &Column) -> PolarsResult { + Ok(s.as_materialized_series().log1p().into()) } -pub(super) fn exp(s: &Series) -> PolarsResult { - Ok(s.exp()) +pub(super) fn exp(s: &Column) -> PolarsResult { + Ok(s.as_materialized_series().exp().into()) } diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index a40c25df764c..0f66344a2cba 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -776,7 +776,7 @@ macro_rules! wrap { }; ($e:expr, $($args:expr),*) => {{ - let f = move |s: &mut [Series]| { + let f = move |s: &mut [Column]| { $e(s, $($args),*) }; @@ -784,13 +784,13 @@ macro_rules! wrap { }}; } -// Fn(&[Series], args) +// Fn(&[Column], args) // all expression arguments are in the slice. // the first element is the root expression. #[macro_export] macro_rules! map_as_slice { ($func:path) => {{ - let f = move |s: &mut [Series]| { + let f = move |s: &mut [Column]| { $func(s).map(Some) }; @@ -798,7 +798,7 @@ macro_rules! map_as_slice { }}; ($func:path, $($args:expr),*) => {{ - let f = move |s: &mut [Series]| { + let f = move |s: &mut [Column]| { $func(s, $($args),*).map(Some) }; @@ -811,18 +811,18 @@ macro_rules! map_as_slice { #[macro_export] macro_rules! 
map_owned { ($func:path) => {{ - let f = move |s: &mut [Series]| { - let s = std::mem::take(&mut s[0]); - $func(s).map(Some) + let f = move |c: &mut [Column]| { + let c = std::mem::take(&mut c[0]); + $func(c).map(Some) }; SpecialEq::new(Arc::new(f)) }}; ($func:path, $($args:expr),*) => {{ - let f = move |s: &mut [Series]| { - let s = std::mem::take(&mut s[0]); - $func(s, $($args),*).map(Some) + let f = move |c: &mut [Column]| { + let c = std::mem::take(&mut c[0]); + $func(c, $($args),*).map(Some) }; SpecialEq::new(Arc::new(f)) @@ -833,25 +833,25 @@ macro_rules! map_owned { #[macro_export] macro_rules! map { ($func:path) => {{ - let f = move |s: &mut [Series]| { - let s = &s[0]; - $func(s).map(Some) + let f = move |c: &mut [Column]| { + let c = &c[0]; + $func(c).map(Some) }; SpecialEq::new(Arc::new(f)) }}; ($func:path, $($args:expr),*) => {{ - let f = move |s: &mut [Series]| { - let s = &s[0]; - $func(s, $($args),*).map(Some) + let f = move |c: &mut [Column]| { + let c = &c[0]; + $func(c, $($args),*).map(Some) }; SpecialEq::new(Arc::new(f)) }}; } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: FunctionExpr) -> Self { use FunctionExpr::*; match func { @@ -877,9 +877,9 @@ impl From for SpecialEq> { Abs => map!(abs::abs), Negate => map!(dispatch::negate), NullCount => { - let f = |s: &mut [Series]| { + let f = |s: &mut [Column]| { let s = &s[0]; - Ok(Some(Series::new( + Ok(Some(Column::new( s.name().clone(), [s.null_count() as IdxSize], ))) diff --git a/crates/polars-plan/src/dsl/function_expr/nan.rs b/crates/polars-plan/src/dsl/function_expr/nan.rs index 45b6274dfc35..035556336a65 100644 --- a/crates/polars-plan/src/dsl/function_expr/nan.rs +++ b/crates/polars-plan/src/dsl/function_expr/nan.rs @@ -1,16 +1,16 @@ use super::*; -pub(super) fn drop_nans(s: Series) -> PolarsResult { +pub(super) fn drop_nans(s: Column) -> PolarsResult { match s.dtype() { DataType::Float32 => { let ca = s.f32()?; let mask = ca.is_not_nan() | ca.is_null(); - 
ca.filter(&mask).map(|ca| ca.into_series()) + ca.filter(&mask).map(|ca| ca.into_column()) }, DataType::Float64 => { let ca = s.f64()?; let mask = ca.is_not_nan() | ca.is_null(); - ca.filter(&mask).map(|ca| ca.into_series()) + ca.filter(&mask).map(|ca| ca.into_column()) }, _ => Ok(s), } diff --git a/crates/polars-plan/src/dsl/function_expr/peaks.rs b/crates/polars-plan/src/dsl/function_expr/peaks.rs index bd3ce01b975c..702a9dc3c86d 100644 --- a/crates/polars-plan/src/dsl/function_expr/peaks.rs +++ b/crates/polars-plan/src/dsl/function_expr/peaks.rs @@ -3,32 +3,34 @@ use polars_ops::chunked_array::peaks::{peak_max as pmax, peak_min as pmin}; use super::*; -pub(super) fn peak_min(s: &Series) -> PolarsResult { +pub(super) fn peak_min(s: &Column) -> PolarsResult { let s = s.to_physical_repr(); + let s = s.as_materialized_series(); let s = match s.dtype() { DataType::Boolean => polars_bail!(opq = peak_min, DataType::Boolean), #[cfg(feature = "dtype-decimal")] - DataType::Decimal(_, _) => pmin(s.decimal()?).into_series(), + DataType::Decimal(_, _) => pmin(s.decimal()?).into_column(), dt => { with_match_physical_numeric_polars_type!(dt, |$T| { let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref(); - pmin(ca).into_series() + pmin(ca).into_column() }) }, }; Ok(s) } -pub(super) fn peak_max(s: &Series) -> PolarsResult { +pub(super) fn peak_max(s: &Column) -> PolarsResult { let s = s.to_physical_repr(); + let s = s.as_materialized_series(); let s = match s.dtype() { DataType::Boolean => polars_bail!(opq = peak_max, DataType::Boolean), #[cfg(feature = "dtype-decimal")] - DataType::Decimal(_, _) => pmax(s.decimal()?).into_series(), + DataType::Decimal(_, _) => pmax(s.decimal()?).into_column(), dt => { with_match_physical_numeric_polars_type!(dt, |$T| { let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref(); - pmax(ca).into_series() + pmax(ca).into_column() }) }, }; diff --git a/crates/polars-plan/src/dsl/function_expr/plugin.rs 
b/crates/polars-plan/src/dsl/function_expr/plugin.rs index 5ce4875fe68d..f11cca8863ec 100644 --- a/crates/polars-plan/src/dsl/function_expr/plugin.rs +++ b/crates/polars-plan/src/dsl/function_expr/plugin.rs @@ -48,11 +48,11 @@ unsafe fn retrieve_error_msg(lib: &Library) -> &CStr { } pub(super) unsafe fn call_plugin( - s: &[Series], + s: &[Column], lib: &str, symbol: &str, kwargs: &[u8], -) -> PolarsResult { +) -> PolarsResult { let plugin = get_lib(lib)?; let lib = &plugin.0; let major = plugin.1; @@ -78,7 +78,8 @@ pub(super) unsafe fn call_plugin( .get(format!("_polars_plugin_{}", symbol).as_bytes()) .unwrap(); - let input = s.iter().map(export_series).collect::>(); + // @scalar-correctness? + let input = s.iter().map(export_column).collect::>(); let input_len = s.len(); let slice_ptr = input.as_ptr(); @@ -104,7 +105,7 @@ pub(super) unsafe fn call_plugin( } if !return_value.is_null() { - import_series(return_value) + import_series(return_value).map(Column::from) } else { let msg = retrieve_error_msg(lib); let msg = msg.to_string_lossy(); diff --git a/crates/polars-plan/src/dsl/function_expr/pow.rs b/crates/polars-plan/src/dsl/function_expr/pow.rs index 5336220d1ace..44394e9ae10a 100644 --- a/crates/polars-plan/src/dsl/function_expr/pow.rs +++ b/crates/polars-plan/src/dsl/function_expr/pow.rs @@ -29,12 +29,12 @@ impl Display for PowFunction { fn pow_on_chunked_arrays( base: &ChunkedArray, exponent: &ChunkedArray, -) -> PolarsResult> +) -> PolarsResult> where T: PolarsNumericType, F: PolarsNumericType, T::Native: num::pow::Pow + ToPrimitive, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { if (base.len() == 1) && (exponent.len() != 1) { let name = base.name(); @@ -44,13 +44,13 @@ where Ok(Some( unary_elementwise_values(exponent, |exp| Pow::pow(base, exp)) - .into_series() + .into_column() .with_name(name.clone()), )) } else { Ok(Some( polars_core::chunked_array::ops::arity::binary(base, exponent, pow_kernel) - .into_series(), + .into_column(), )) } } @@ 
-58,38 +58,38 @@ where fn pow_on_floats( base: &ChunkedArray, exponent: &ChunkedArray, -) -> PolarsResult> +) -> PolarsResult> where T: PolarsFloatType, T::Native: num::pow::Pow + ToPrimitive + Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { let dtype = T::get_dtype(); if exponent.len() == 1 { let Some(exponent_value) = exponent.get(0) else { - return Ok(Some(Series::full_null( + return Ok(Some(Column::full_null( base.name().clone(), base.len(), &dtype, ))); }; let s = match exponent_value.to_f64().unwrap() { - a if a == 1.0 => base.clone().into_series(), + a if a == 1.0 => base.clone().into_column(), // specialized sqrt will ensure (-inf)^0.5 = NaN // and will likely be faster as well. - a if a == 0.5 => base.apply_values(|v| v.sqrt()).into_series(), + a if a == 0.5 => base.apply_values(|v| v.sqrt()).into_column(), a if a.fract() == 0.0 && a < 10.0 && a > 1.0 => { let mut out = base.clone(); for _ in 1..exponent_value.to_u8().unwrap() { out = out * base.clone() } - out.into_series() + out.into_column() }, _ => base .apply_values(|v| Pow::pow(v, exponent_value)) - .into_series(), + .into_column(), }; Ok(Some(s)) } else { @@ -100,36 +100,36 @@ where fn pow_to_uint_dtype( base: &ChunkedArray, exponent: &ChunkedArray, -) -> PolarsResult> +) -> PolarsResult> where T: PolarsIntegerType, F: PolarsIntegerType, T::Native: num::pow::Pow + ToPrimitive, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { let dtype = T::get_dtype(); if exponent.len() == 1 { let Some(exponent_value) = exponent.get(0) else { - return Ok(Some(Series::full_null( + return Ok(Some(Column::full_null( base.name().clone(), base.len(), &dtype, ))); }; let s = match exponent_value.to_u64().unwrap() { - 1 => base.clone().into_series(), + 1 => base.clone().into_column(), 2..=10 => { let mut out = base.clone(); for _ in 1..exponent_value.to_u8().unwrap() { out = out * base.clone() } - out.into_series() + out.into_column() }, _ => base .apply_values(|v| Pow::pow(v, exponent_value)) - 
.into_series(), + .into_column(), }; Ok(Some(s)) } else { @@ -137,7 +137,7 @@ where } } -fn pow_on_series(base: &Series, exponent: &Series) -> PolarsResult> { +fn pow_on_series(base: &Column, exponent: &Column) -> PolarsResult> { use DataType::*; let base_dtype = base.dtype(); @@ -193,7 +193,7 @@ fn pow_on_series(base: &Series, exponent: &Series) -> PolarsResult PolarsResult> { +pub(super) fn pow(s: &mut [Column]) -> PolarsResult> { let base = &s[0]; let exponent = &s[1]; @@ -210,7 +210,7 @@ pub(super) fn pow(s: &mut [Series]) -> PolarsResult> { } } -pub(super) fn sqrt(base: &Series) -> PolarsResult { +pub(super) fn sqrt(base: &Column) -> PolarsResult { use DataType::*; match base.dtype() { Float32 => { @@ -228,16 +228,16 @@ pub(super) fn sqrt(base: &Series) -> PolarsResult { } } -fn sqrt_on_floats(base: &ChunkedArray) -> PolarsResult +fn sqrt_on_floats(base: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: num::pow::Pow + ToPrimitive + Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(base.apply_values(|v| v.sqrt()).into_series()) + Ok(base.apply_values(|v| v.sqrt()).into_column()) } -pub(super) fn cbrt(base: &Series) -> PolarsResult { +pub(super) fn cbrt(base: &Column) -> PolarsResult { use DataType::*; match base.dtype() { Float32 => { @@ -255,11 +255,11 @@ pub(super) fn cbrt(base: &Series) -> PolarsResult { } } -fn cbrt_on_floats(base: &ChunkedArray) -> PolarsResult +fn cbrt_on_floats(base: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: num::pow::Pow + ToPrimitive + Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(base.apply_values(|v| v.cbrt()).into_series()) + Ok(base.apply_values(|v| v.cbrt()).into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/random.rs b/crates/polars-plan/src/dsl/function_expr/random.rs index cb21e08367aa..91b0decc1c71 100644 --- a/crates/polars-plan/src/dsl/function_expr/random.rs +++ b/crates/polars-plan/src/dsl/function_expr/random.rs @@ 
-23,16 +23,16 @@ impl Hash for RandomMethod { } } -pub(super) fn shuffle(s: &Series, seed: Option) -> PolarsResult { +pub(super) fn shuffle(s: &Column, seed: Option) -> PolarsResult { Ok(s.shuffle(seed)) } pub(super) fn sample_frac( - s: &[Series], + s: &[Column], with_replacement: bool, shuffle: bool, seed: Option, -) -> PolarsResult { +) -> PolarsResult { let src = &s[0]; let frac_s = &s[1]; @@ -46,16 +46,16 @@ pub(super) fn sample_frac( match frac.get(0) { Some(frac) => src.sample_frac(frac, with_replacement, shuffle, seed), - None => Ok(Series::new_empty(src.name().clone(), src.dtype())), + None => Ok(Column::new_empty(src.name().clone(), src.dtype())), } } pub(super) fn sample_n( - s: &[Series], + s: &[Column], with_replacement: bool, shuffle: bool, seed: Option, -) -> PolarsResult { +) -> PolarsResult { let src = &s[0]; let n_s = &s[1]; @@ -69,6 +69,6 @@ pub(super) fn sample_n( match n.get(0) { Some(n) => src.sample_n(n as usize, with_replacement, shuffle, seed), - None => Ok(Series::new_empty(src.name().clone(), src.dtype())), + None => Ok(Column::new_empty(src.name().clone(), src.dtype())), } } diff --git a/crates/polars-plan/src/dsl/function_expr/range/date_range.rs b/crates/polars-plan/src/dsl/function_expr/range/date_range.rs index 5518d32df275..116d626a923f 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/date_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/date_range.rs @@ -10,10 +10,10 @@ use super::utils::{ const CAPACITY_FACTOR: usize = 5; pub(super) fn date_range( - s: &[Series], + s: &[Column], interval: Duration, closed: ClosedWindow, -) -> PolarsResult { +) -> PolarsResult { let start = &s[0]; let end = &s[1]; @@ -44,14 +44,14 @@ pub(super) fn date_range( )?; let to_type = DataType::Date; - out.cast(&to_type) + out.cast(&to_type).map(Column::from) } pub(super) fn date_ranges( - s: &[Series], + s: &[Column], interval: Duration, closed: ClosedWindow, -) -> PolarsResult { +) -> PolarsResult { let start = &s[0]; let end = 
&s[1]; diff --git a/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs b/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs index 394889dd34f1..a61264ce7aca 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/datetime_range.rs @@ -12,12 +12,12 @@ use crate::dsl::function_expr::FieldsMapper; const CAPACITY_FACTOR: usize = 5; pub(super) fn datetime_range( - s: &[Series], + s: &[Column], interval: Duration, closed: ClosedWindow, time_unit: Option, time_zone: Option, -) -> PolarsResult { +) -> PolarsResult { let mut start = s[0].clone(); let mut end = s[1].clone(); @@ -69,7 +69,7 @@ pub(super) fn datetime_range( NonExistent::Raise, )? .cast(&dtype)? - .into_series(), + .into_column(), polars_ops::prelude::replace_time_zone( end.datetime().unwrap(), Some(&tz), @@ -77,7 +77,7 @@ pub(super) fn datetime_range( NonExistent::Raise, )? .cast(&dtype)? - .into_series(), + .into_column(), ), _ => (start.cast(&dtype)?, end.cast(&dtype)?), }; @@ -99,16 +99,16 @@ pub(super) fn datetime_range( }, _ => unimplemented!(), }; - Ok(result.cast(&dtype).unwrap().into_series()) + Ok(result.cast(&dtype).unwrap().into_column()) } pub(super) fn datetime_ranges( - s: &[Series], + s: &[Column], interval: Duration, closed: ClosedWindow, time_unit: Option, time_zone: Option, -) -> PolarsResult { +) -> PolarsResult { let mut start = s[0].clone(); let mut end = s[1].clone(); @@ -158,7 +158,7 @@ pub(super) fn datetime_ranges( NonExistent::Raise, )? .cast(&dtype)? - .into_series() + .into_column() .to_physical_repr() .cast(&DataType::Int64)?, polars_ops::prelude::replace_time_zone( @@ -168,7 +168,7 @@ pub(super) fn datetime_ranges( NonExistent::Raise, )? .cast(&dtype)? 
- .into_series() + .into_column() .to_physical_repr() .cast(&DataType::Int64)?, ), @@ -220,7 +220,7 @@ pub(super) fn datetime_ranges( }; let to_type = DataType::List(Box::new(dtype)); - out.cast(&to_type) + out.cast(&to_type).map(Column::from) } impl<'a> FieldsMapper<'a> { diff --git a/crates/polars-plan/src/dsl/function_expr/range/int_range.rs b/crates/polars-plan/src/dsl/function_expr/range/int_range.rs index f1ae0ffe13a7..f9b524cfe481 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/int_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/int_range.rs @@ -6,7 +6,7 @@ use super::utils::{ensure_range_bounds_contain_exactly_one_value, numeric_ranges const CAPACITY_FACTOR: usize = 5; -pub(super) fn int_range(s: &[Series], step: i64, dtype: DataType) -> PolarsResult { +pub(super) fn int_range(s: &[Column], step: i64, dtype: DataType) -> PolarsResult { let mut start = &s[0]; let mut end = &s[1]; let name = start.name(); @@ -27,22 +27,22 @@ pub(super) fn int_range(s: &[Series], step: i64, dtype: DataType) -> PolarsResul with_match_physical_integer_polars_type!(dtype, |$T| { let start_v = get_first_series_value::<$T>(start)?; let end_v = get_first_series_value::<$T>(end)?; - new_int_range::<$T>(start_v, end_v, step, name.clone()) + new_int_range::<$T>(start_v, end_v, step, name.clone()).map(Column::from) }) } -fn get_first_series_value(s: &Series) -> PolarsResult +fn get_first_series_value(s: &Column) -> PolarsResult where T: PolarsIntegerType, { - let ca: &ChunkedArray = s.as_any().downcast_ref().unwrap(); + let ca: &ChunkedArray = s.as_materialized_series().as_any().downcast_ref().unwrap(); let value_opt = ca.get(0); let value = value_opt.ok_or_else(|| polars_err!(ComputeError: "invalid null input for `int_range`"))?; Ok(value) } -pub(super) fn int_ranges(s: &[Series]) -> PolarsResult { +pub(super) fn int_ranges(s: &[Column]) -> PolarsResult { let start = &s[0]; let end = &s[1]; let step = &s[2]; diff --git 
a/crates/polars-plan/src/dsl/function_expr/range/mod.rs b/crates/polars-plan/src/dsl/function_expr/range/mod.rs index 3350f0c6f8f5..61eebd9cf4b1 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/mod.rs @@ -18,7 +18,7 @@ use serde::{Deserialize, Serialize}; use crate::dsl::function_expr::FieldsMapper; use crate::dsl::SpecialEq; use crate::map_as_slice; -use crate::prelude::SeriesUdf; +use crate::prelude::ColumnsUdf; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Debug, Eq, Hash)] @@ -129,7 +129,7 @@ impl Display for RangeFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: RangeFunction) -> Self { use RangeFunction::*; match func { diff --git a/crates/polars-plan/src/dsl/function_expr/range/time_range.rs b/crates/polars-plan/src/dsl/function_expr/range/time_range.rs index 52211e89bc56..e339105bee3f 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/time_range.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/time_range.rs @@ -9,10 +9,10 @@ use super::utils::{ const CAPACITY_FACTOR: usize = 5; pub(super) fn time_range( - s: &[Series], + s: &[Column], interval: Duration, closed: ClosedWindow, -) -> PolarsResult { +) -> PolarsResult { let start = &s[0]; let end = &s[1]; let name = start.name(); @@ -26,14 +26,14 @@ pub(super) fn time_range( .ok_or_else(|| polars_err!(ComputeError: "end is an out-of-range time."))?; let out = time_range_impl(name.clone(), start, end, interval, closed)?; - Ok(out.cast(&dtype).unwrap().into_series()) + Ok(out.cast(&dtype).unwrap().into_column()) } pub(super) fn time_ranges( - s: &[Series], + s: &[Column], interval: Duration, closed: ClosedWindow, -) -> PolarsResult { +) -> PolarsResult { let start = &s[0]; let end = &s[1]; @@ -62,5 +62,5 @@ pub(super) fn time_ranges( let out = temporal_ranges_impl_broadcast(start, end, range_impl, &mut builder)?; let to_type = 
DataType::List(Box::new(DataType::Time)); - out.cast(&to_type) + out.cast(&to_type).map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/range/utils.rs b/crates/polars-plan/src/dsl/function_expr/range/utils.rs index b748daf0879a..232a9d1bb37e 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/utils.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/utils.rs @@ -1,14 +1,14 @@ use polars_core::prelude::{ - polars_bail, polars_ensure, ChunkedArray, Int64Chunked, IntoSeries, ListBuilderTrait, + polars_bail, polars_ensure, ChunkedArray, Column, Int64Chunked, IntoColumn, ListBuilderTrait, ListPrimitiveChunkedBuilder, PolarsIntegerType, PolarsResult, Series, }; -pub(super) fn temporal_series_to_i64_scalar(s: &Series) -> Option { +pub(super) fn temporal_series_to_i64_scalar(s: &Column) -> Option { s.to_physical_repr().get(0).unwrap().extract::() } pub(super) fn ensure_range_bounds_contain_exactly_one_value( - start: &Series, - end: &Series, + start: &Column, + end: &Column, ) -> PolarsResult<()> { polars_ensure!( start.len() == 1, @@ -28,7 +28,7 @@ pub(super) fn numeric_ranges_impl_broadcast( step: &Int64Chunked, range_impl: F, builder: &mut ListPrimitiveChunkedBuilder, -) -> PolarsResult +) -> PolarsResult where T: PolarsIntegerType, U: PolarsIntegerType, @@ -133,7 +133,7 @@ where ) }, }; - let out = builder.finish().into_series(); + let out = builder.finish().into_column(); Ok(out) } @@ -143,7 +143,7 @@ pub(super) fn temporal_ranges_impl_broadcast( end: &ChunkedArray, range_impl: F, builder: &mut ListPrimitiveChunkedBuilder, -) -> PolarsResult +) -> PolarsResult where T: PolarsIntegerType, U: PolarsIntegerType, @@ -190,7 +190,7 @@ where ) }, }; - let out = builder.finish().into_series(); + let out = builder.finish().into_column(); Ok(out) } diff --git a/crates/polars-plan/src/dsl/function_expr/rolling.rs b/crates/polars-plan/src/dsl/function_expr/rolling.rs index 9302ab4a1ad7..c108c92b571a 100644 --- 
a/crates/polars-plan/src/dsl/function_expr/rolling.rs +++ b/crates/polars-plan/src/dsl/function_expr/rolling.rs @@ -52,38 +52,62 @@ impl Hash for RollingFunction { } } -pub(super) fn rolling_min(s: &Series, options: RollingOptionsFixedWindow) -> PolarsResult { - s.rolling_min(options) +pub(super) fn rolling_min(s: &Column, options: RollingOptionsFixedWindow) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_min(options) + .map(Column::from) } -pub(super) fn rolling_max(s: &Series, options: RollingOptionsFixedWindow) -> PolarsResult { - s.rolling_max(options) +pub(super) fn rolling_max(s: &Column, options: RollingOptionsFixedWindow) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_max(options) + .map(Column::from) } -pub(super) fn rolling_mean(s: &Series, options: RollingOptionsFixedWindow) -> PolarsResult { - s.rolling_mean(options) +pub(super) fn rolling_mean(s: &Column, options: RollingOptionsFixedWindow) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_mean(options) + .map(Column::from) } -pub(super) fn rolling_sum(s: &Series, options: RollingOptionsFixedWindow) -> PolarsResult { - s.rolling_sum(options) +pub(super) fn rolling_sum(s: &Column, options: RollingOptionsFixedWindow) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_sum(options) + .map(Column::from) } pub(super) fn rolling_quantile( - s: &Series, + s: &Column, options: RollingOptionsFixedWindow, -) -> PolarsResult { - s.rolling_quantile(options) +) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_quantile(options) + .map(Column::from) } -pub(super) fn rolling_var(s: &Series, options: RollingOptionsFixedWindow) -> PolarsResult { - s.rolling_var(options) +pub(super) fn rolling_var(s: &Column, options: RollingOptionsFixedWindow) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_var(options) + .map(Column::from) } -pub(super) fn rolling_std(s: &Series, 
options: RollingOptionsFixedWindow) -> PolarsResult { - s.rolling_std(options) +pub(super) fn rolling_std(s: &Column, options: RollingOptionsFixedWindow) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_std(options) + .map(Column::from) } #[cfg(feature = "moment")] -pub(super) fn rolling_skew(s: &Series, window_size: usize, bias: bool) -> PolarsResult { - s.rolling_skew(window_size, bias) +pub(super) fn rolling_skew(s: &Column, window_size: usize, bias: bool) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .rolling_skew(window_size, bias) + .map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/rolling_by.rs b/crates/polars-plan/src/dsl/function_expr/rolling_by.rs index c2b3510281f2..dfc6e34e8c5a 100644 --- a/crates/polars-plan/src/dsl/function_expr/rolling_by.rs +++ b/crates/polars-plan/src/dsl/function_expr/rolling_by.rs @@ -39,50 +39,71 @@ impl Hash for RollingFunctionBy { } pub(super) fn rolling_min_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> PolarsResult { - s[0].rolling_min_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_min_by(&s[1].as_materialized_series(), options) + .map(Column::from) } pub(super) fn rolling_max_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> PolarsResult { - s[0].rolling_max_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_max_by(&s[1].as_materialized_series(), options) + .map(Column::from) } pub(super) fn rolling_mean_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> PolarsResult { - s[0].rolling_mean_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_mean_by(&s[1].as_materialized_series(), options) + .map(Column::from) } pub(super) fn rolling_sum_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> 
PolarsResult { - s[0].rolling_sum_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_sum_by(&s[1].as_materialized_series(), options) + .map(Column::from) } pub(super) fn rolling_quantile_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> PolarsResult { - s[0].rolling_quantile_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_quantile_by(&s[1].as_materialized_series(), options) + .map(Column::from) } pub(super) fn rolling_var_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> PolarsResult { - s[0].rolling_var_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_var_by(&s[1].as_materialized_series(), options) + .map(Column::from) } pub(super) fn rolling_std_by( - s: &[Series], + s: &[Column], options: RollingOptionsDynamicWindow, -) -> PolarsResult { - s[0].rolling_std_by(&s[1], options) +) -> PolarsResult { + // @scalar-opt + s[0].as_materialized_series() + .rolling_std_by(&s[1].as_materialized_series(), options) + .map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/round.rs b/crates/polars-plan/src/dsl/function_expr/round.rs index be7b25d00706..6858abce9b8c 100644 --- a/crates/polars-plan/src/dsl/function_expr/round.rs +++ b/crates/polars-plan/src/dsl/function_expr/round.rs @@ -1,17 +1,23 @@ use super::*; -pub(super) fn round(s: &Series, decimals: u32) -> PolarsResult { - s.round(decimals) +pub(super) fn round(s: &Column, decimals: u32) -> PolarsResult { + // @scalar-opt + s.as_materialized_series().round(decimals).map(Column::from) } -pub(super) fn round_sig_figs(s: &Series, digits: i32) -> PolarsResult { - s.round_sig_figs(digits) +pub(super) fn round_sig_figs(s: &Column, digits: i32) -> PolarsResult { + // @scalar-opt + s.as_materialized_series() + .round_sig_figs(digits) + .map(Column::from) } -pub(super) fn floor(s: &Series) -> 
PolarsResult { - s.floor() +pub(super) fn floor(s: &Column) -> PolarsResult { + // @scalar-opt + s.as_materialized_series().floor().map(Column::from) } -pub(super) fn ceil(s: &Series) -> PolarsResult { - s.ceil() +pub(super) fn ceil(s: &Column) -> PolarsResult { + // @scalar-opt + s.as_materialized_series().ceil().map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/row_hash.rs b/crates/polars-plan/src/dsl/function_expr/row_hash.rs index 3a2d33f08384..2039e86c6faa 100644 --- a/crates/polars-plan/src/dsl/function_expr/row_hash.rs +++ b/crates/polars-plan/src/dsl/function_expr/row_hash.rs @@ -1,6 +1,8 @@ use super::*; -pub(super) fn row_hash(s: &Series, k0: u64, k1: u64, k2: u64, k3: u64) -> PolarsResult { - Ok(s.hash(PlRandomState::with_seeds(k0, k1, k2, k3)) - .into_series()) +pub(super) fn row_hash(c: &Column, k0: u64, k1: u64, k2: u64, k3: u64) -> PolarsResult { + // @scalar-opt + Ok(c.as_materialized_series() + .hash(PlRandomState::with_seeds(k0, k1, k2, k3)) + .into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/search_sorted.rs b/crates/polars-plan/src/dsl/function_expr/search_sorted.rs index 87933fc7bd6c..38f6ef81d5c3 100644 --- a/crates/polars-plan/src/dsl/function_expr/search_sorted.rs +++ b/crates/polars-plan/src/dsl/function_expr/search_sorted.rs @@ -1,8 +1,14 @@ use super::*; -pub(super) fn search_sorted_impl(s: &mut [Series], side: SearchSortedSide) -> PolarsResult { +pub(super) fn search_sorted_impl(s: &mut [Column], side: SearchSortedSide) -> PolarsResult { let sorted_array = &s[0]; let search_value = &s[1]; - search_sorted(sorted_array, search_value, side, false).map(|ca| ca.into_series()) + search_sorted( + sorted_array.as_materialized_series(), + search_value.as_materialized_series(), + side, + false, + ) + .map(|ca| ca.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs index 6ebc5f3d221e..a1ecc4a12d02 100644 
--- a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs +++ b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs @@ -16,7 +16,7 @@ where feature = "dtype-struct", feature = "dtype-categorical" ))] -fn shift_and_fill_with_mask(s: &Series, n: i64, fill_value: &Series) -> PolarsResult { +fn shift_and_fill_with_mask(s: &Column, n: i64, fill_value: &Column) -> PolarsResult { use polars_core::export::arrow::array::BooleanArray; use polars_core::export::arrow::bitmap::MutableBitmap; @@ -40,7 +40,7 @@ fn shift_and_fill_with_mask(s: &Series, n: i64, fill_value: &Series) -> PolarsRe s.shift(n).zip_with_same_type(&mask, fill_value) } -pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { +pub(super) fn shift_and_fill(args: &[Column]) -> PolarsResult { let s = &args[0]; let n_s = &args[1]; @@ -66,7 +66,7 @@ pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { AnyValue::Null => None, v => polars_bail!(ComputeError: "fill value '{}' is not supported", v), }; - ca.shift_and_fill(n, fill_value).into_series().cast(logical) + ca.shift_and_fill(n, fill_value).into_column().cast(logical) }, String => { let ca = s.str()?; @@ -75,7 +75,7 @@ pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { AnyValue::Null => None, v => polars_bail!(ComputeError: "fill value '{}' is not supported", v), }; - ca.shift_and_fill(n, fill_value).into_series().cast(logical) + ca.shift_and_fill(n, fill_value).into_column().cast(logical) }, List(_) => { let ca = s.list()?; @@ -85,7 +85,7 @@ pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { v => polars_bail!(ComputeError: "fill value '{}' is not supported", v), }; ca.shift_and_fill(n, fill_value.as_ref()) - .into_series() + .into_column() .cast(logical) }, #[cfg(feature = "object")] @@ -97,7 +97,7 @@ pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { dt if dt.is_numeric() || dt.is_logical() => { macro_rules! 
dispatch { ($ca:expr, $n:expr, $fill_value:expr) => {{ - shift_and_fill_numeric($ca, $n, $fill_value).into_series() + shift_and_fill_numeric($ca, $n, $fill_value).into_column() }}; } let out = downcast_as_macro_arg_physical!(physical, dispatch, n, fill_value); @@ -106,11 +106,11 @@ pub(super) fn shift_and_fill(args: &[Series]) -> PolarsResult { dt => polars_bail!(opq = shift_and_fill, dt), } } else { - Ok(Series::full_null(s.name().clone(), s.len(), s.dtype())) + Ok(Column::full_null(s.name().clone(), s.len(), s.dtype())) } } -pub fn shift(args: &[Series]) -> PolarsResult { +pub fn shift(args: &[Column]) -> PolarsResult { let s = &args[0]; let n_s = &args[1]; polars_ensure!( @@ -123,6 +123,6 @@ pub fn shift(args: &[Series]) -> PolarsResult { match n.get(0) { Some(n) => Ok(s.shift(n)), - None => Ok(Series::full_null(s.name().clone(), s.len(), s.dtype())), + None => Ok(Column::full_null(s.name().clone(), s.len(), s.dtype())), } } diff --git a/crates/polars-plan/src/dsl/function_expr/shrink_type.rs b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs index 0dee54f79e8f..8489adda82a8 100644 --- a/crates/polars-plan/src/dsl/function_expr/shrink_type.rs +++ b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs @@ -1,6 +1,6 @@ use super::*; -pub(super) fn shrink(s: Series) -> PolarsResult { +pub(super) fn shrink(s: Column) -> PolarsResult { if !s.dtype().is_numeric() { return Ok(s); } @@ -9,6 +9,9 @@ pub(super) fn shrink(s: Series) -> PolarsResult { return s.cast(&DataType::Float32); } + // @scalar-opt + let s = s.as_materialized_series(); + if s.dtype().is_unsigned_integer() { let max = s.max_reduce()?.value().extract::().unwrap_or(0_u64); @@ -19,7 +22,7 @@ pub(super) fn shrink(s: Series) -> PolarsResult { } else if max <= u32::MAX as u64 { s.cast(&DataType::UInt32) } else { - Ok(s) + Ok(s.clone()) } } else { let min = s.min_reduce()?.value().extract::().unwrap_or(0_i64); @@ -32,7 +35,7 @@ pub(super) fn shrink(s: Series) -> PolarsResult { } else if min >= 
i32::MIN as i64 && max <= i32::MAX as i64 { s.cast(&DataType::Int32) } else { - Ok(s) + Ok(s.clone()) } - } + }.map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/sign.rs b/crates/polars-plan/src/dsl/function_expr/sign.rs index a7bf4d3277e6..471c76f8ad2d 100644 --- a/crates/polars-plan/src/dsl/function_expr/sign.rs +++ b/crates/polars-plan/src/dsl/function_expr/sign.rs @@ -4,7 +4,8 @@ use polars_core::with_match_physical_numeric_polars_type; use super::*; -pub(super) fn sign(s: &Series) -> PolarsResult { +pub(super) fn sign(s: &Column) -> PolarsResult { + let s = s.as_materialized_series(); let dt = s.dtype(); polars_ensure!(dt.is_numeric(), opq = sign, dt); with_match_physical_numeric_polars_type!(dt, |$T| { @@ -13,10 +14,10 @@ pub(super) fn sign(s: &Series) -> PolarsResult { }) } -fn sign_impl(ca: &ChunkedArray) -> Series +fn sign_impl(ca: &ChunkedArray) -> Column where T: PolarsNumericType, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { ca.apply_values(|x| { if x < T::Native::zero() { @@ -30,5 +31,5 @@ where x } }) - .into_series() + .into_column() } diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 9a5d2a9ff537..d3743463d308 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -290,7 +290,7 @@ impl Display for StringFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: StringFunction) -> Self { use StringFunction::*; match func { @@ -405,15 +405,15 @@ impl From for SpecialEq> { } #[cfg(feature = "find_many")] -fn contains_many(s: &[Series], ascii_case_insensitive: bool) -> PolarsResult { +fn contains_many(s: &[Column], ascii_case_insensitive: bool) -> PolarsResult { let ca = s[0].str()?; let patterns = s[1].str()?; polars_ops::chunked_array::strings::contains_any(ca, patterns, ascii_case_insensitive) - .map(|out| out.into_series()) + .map(|out| 
out.into_column()) } #[cfg(feature = "find_many")] -fn replace_many(s: &[Series], ascii_case_insensitive: bool) -> PolarsResult { +fn replace_many(s: &[Column], ascii_case_insensitive: bool) -> PolarsResult { let ca = s[0].str()?; let patterns = s[1].str()?; let replace_with = s[2].str()?; @@ -423,148 +423,148 @@ fn replace_many(s: &[Series], ascii_case_insensitive: bool) -> PolarsResult PolarsResult { +) -> PolarsResult { let ca = s[0].str()?; let patterns = &s[1]; polars_ops::chunked_array::strings::extract_many( ca, - patterns, + patterns.as_materialized_series(), ascii_case_insensitive, overlapping, ) - .map(|out| out.into_series()) + .map(|out| out.into_column()) } -fn uppercase(s: &Series) -> PolarsResult { +fn uppercase(s: &Column) -> PolarsResult { let ca = s.str()?; - Ok(ca.to_uppercase().into_series()) + Ok(ca.to_uppercase().into_column()) } -fn lowercase(s: &Series) -> PolarsResult { +fn lowercase(s: &Column) -> PolarsResult { let ca = s.str()?; - Ok(ca.to_lowercase().into_series()) + Ok(ca.to_lowercase().into_column()) } #[cfg(feature = "nightly")] -pub(super) fn titlecase(s: &Series) -> PolarsResult { +pub(super) fn titlecase(s: &Column) -> PolarsResult { let ca = s.str()?; - Ok(ca.to_titlecase().into_series()) + Ok(ca.to_titlecase().into_column()) } -pub(super) fn len_chars(s: &Series) -> PolarsResult { +pub(super) fn len_chars(s: &Column) -> PolarsResult { let ca = s.str()?; - Ok(ca.str_len_chars().into_series()) + Ok(ca.str_len_chars().into_column()) } -pub(super) fn len_bytes(s: &Series) -> PolarsResult { +pub(super) fn len_bytes(s: &Column) -> PolarsResult { let ca = s.str()?; - Ok(ca.str_len_bytes().into_series()) + Ok(ca.str_len_bytes().into_column()) } #[cfg(feature = "regex")] -pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResult { +pub(super) fn contains(s: &[Column], literal: bool, strict: bool) -> PolarsResult { let ca = s[0].str()?; let pat = s[1].str()?; ca.contains_chunked(pat, literal, strict) - .map(|ok| 
ok.into_series()) + .map(|ok| ok.into_column()) } #[cfg(feature = "regex")] -pub(super) fn find(s: &[Series], literal: bool, strict: bool) -> PolarsResult { +pub(super) fn find(s: &[Column], literal: bool, strict: bool) -> PolarsResult { let ca = s[0].str()?; let pat = s[1].str()?; ca.find_chunked(pat, literal, strict) - .map(|ok| ok.into_series()) + .map(|ok| ok.into_column()) } -pub(super) fn ends_with(s: &[Series]) -> PolarsResult { +pub(super) fn ends_with(s: &[Column]) -> PolarsResult { let ca = &s[0].str()?.as_binary(); let suffix = &s[1].str()?.as_binary(); - Ok(ca.ends_with_chunked(suffix).into_series()) + Ok(ca.ends_with_chunked(suffix).into_column()) } -pub(super) fn starts_with(s: &[Series]) -> PolarsResult { +pub(super) fn starts_with(s: &[Column]) -> PolarsResult { let ca = &s[0].str()?.as_binary(); let prefix = &s[1].str()?.as_binary(); - Ok(ca.starts_with_chunked(prefix).into_series()) + Ok(ca.starts_with_chunked(prefix).into_column()) } /// Extract a regex pattern from the a string value. 
-pub(super) fn extract(s: &[Series], group_index: usize) -> PolarsResult { +pub(super) fn extract(s: &[Column], group_index: usize) -> PolarsResult { let ca = s[0].str()?; let pat = s[1].str()?; - ca.extract(pat, group_index).map(|ca| ca.into_series()) + ca.extract(pat, group_index).map(|ca| ca.into_column()) } #[cfg(feature = "extract_groups")] /// Extract all capture groups from a regex pattern as a struct -pub(super) fn extract_groups(s: &Series, pat: &str, dtype: &DataType) -> PolarsResult { +pub(super) fn extract_groups(s: &Column, pat: &str, dtype: &DataType) -> PolarsResult { let ca = s.str()?; - ca.extract_groups(pat, dtype) + ca.extract_groups(pat, dtype).map(Column::from) } #[cfg(feature = "string_pad")] -pub(super) fn pad_start(s: &Series, length: usize, fill_char: char) -> PolarsResult { +pub(super) fn pad_start(s: &Column, length: usize, fill_char: char) -> PolarsResult { let ca = s.str()?; - Ok(ca.pad_start(length, fill_char).into_series()) + Ok(ca.pad_start(length, fill_char).into_column()) } #[cfg(feature = "string_pad")] -pub(super) fn pad_end(s: &Series, length: usize, fill_char: char) -> PolarsResult { +pub(super) fn pad_end(s: &Column, length: usize, fill_char: char) -> PolarsResult { let ca = s.str()?; - Ok(ca.pad_end(length, fill_char).into_series()) + Ok(ca.pad_end(length, fill_char).into_column()) } #[cfg(feature = "string_pad")] -pub(super) fn zfill(s: &[Series]) -> PolarsResult { +pub(super) fn zfill(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let length_s = s[1].strict_cast(&DataType::UInt64)?; let length = length_s.u64()?; - Ok(ca.zfill(length).into_series()) + Ok(ca.zfill(length).into_column()) } -pub(super) fn strip_chars(s: &[Series]) -> PolarsResult { +pub(super) fn strip_chars(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let pat_s = &s[1]; - ca.strip_chars(pat_s).map(|ok| ok.into_series()) + ca.strip_chars(pat_s).map(|ok| ok.into_column()) } -pub(super) fn strip_chars_start(s: &[Series]) -> PolarsResult { 
+pub(super) fn strip_chars_start(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let pat_s = &s[1]; - ca.strip_chars_start(pat_s).map(|ok| ok.into_series()) + ca.strip_chars_start(pat_s).map(|ok| ok.into_column()) } -pub(super) fn strip_chars_end(s: &[Series]) -> PolarsResult { +pub(super) fn strip_chars_end(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let pat_s = &s[1]; - ca.strip_chars_end(pat_s).map(|ok| ok.into_series()) + ca.strip_chars_end(pat_s).map(|ok| ok.into_column()) } -pub(super) fn strip_prefix(s: &[Series]) -> PolarsResult { +pub(super) fn strip_prefix(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let prefix = s[1].str()?; - Ok(ca.strip_prefix(prefix).into_series()) + Ok(ca.strip_prefix(prefix).into_column()) } -pub(super) fn strip_suffix(s: &[Series]) -> PolarsResult { +pub(super) fn strip_suffix(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let suffix = s[1].str()?; - Ok(ca.strip_suffix(suffix).into_series()) + Ok(ca.strip_suffix(suffix).into_column()) } -pub(super) fn extract_all(args: &[Series]) -> PolarsResult { +pub(super) fn extract_all(args: &[Column]) -> PolarsResult { let s = &args[0]; let pat = &args[1]; @@ -573,20 +573,20 @@ pub(super) fn extract_all(args: &[Series]) -> PolarsResult { if pat.len() == 1 { if let Some(pat) = pat.get(0) { - ca.extract_all(pat).map(|ca| ca.into_series()) + ca.extract_all(pat).map(|ca| ca.into_column()) } else { - Ok(Series::full_null( + Ok(Column::full_null( ca.name().clone(), ca.len(), &DataType::List(Box::new(DataType::String)), )) } } else { - ca.extract_all_many(pat).map(|ca| ca.into_series()) + ca.extract_all_many(pat).map(|ca| ca.into_column()) } } -pub(super) fn count_matches(args: &[Series], literal: bool) -> PolarsResult { +pub(super) fn count_matches(args: &[Column], literal: bool) -> PolarsResult { let s = &args[0]; let pat = &args[1]; @@ -594,9 +594,9 @@ pub(super) fn count_matches(args: &[Series], literal: bool) -> PolarsResult PolarsResult PolarsResult { +) -> 
PolarsResult { match dtype { #[cfg(feature = "dtype-date")] DataType::Date => to_date(&s[0], options), @@ -628,62 +628,62 @@ pub(super) fn strptime( } #[cfg(feature = "dtype-struct")] -pub(super) fn split_exact(s: &[Series], n: usize, inclusive: bool) -> PolarsResult { +pub(super) fn split_exact(s: &[Column], n: usize, inclusive: bool) -> PolarsResult { let ca = s[0].str()?; let by = s[1].str()?; if inclusive { - ca.split_exact_inclusive(by, n).map(|ca| ca.into_series()) + ca.split_exact_inclusive(by, n).map(|ca| ca.into_column()) } else { - ca.split_exact(by, n).map(|ca| ca.into_series()) + ca.split_exact(by, n).map(|ca| ca.into_column()) } } #[cfg(feature = "dtype-struct")] -pub(super) fn splitn(s: &[Series], n: usize) -> PolarsResult { +pub(super) fn splitn(s: &[Column], n: usize) -> PolarsResult { let ca = s[0].str()?; let by = s[1].str()?; - ca.splitn(by, n).map(|ca| ca.into_series()) + ca.splitn(by, n).map(|ca| ca.into_column()) } -pub(super) fn split(s: &[Series], inclusive: bool) -> PolarsResult { +pub(super) fn split(s: &[Column], inclusive: bool) -> PolarsResult { let ca = s[0].str()?; let by = s[1].str()?; if inclusive { - Ok(ca.split_inclusive(by).into_series()) + Ok(ca.split_inclusive(by).into_column()) } else { - Ok(ca.split(by).into_series()) + Ok(ca.split(by).into_column()) } } #[cfg(feature = "dtype-date")] -fn to_date(s: &Series, options: &StrptimeOptions) -> PolarsResult { +fn to_date(s: &Column, options: &StrptimeOptions) -> PolarsResult { let ca = s.str()?; let out = { if options.exact { ca.as_date(options.format.as_deref(), options.cache)? - .into_series() + .into_column() } else { ca.as_date_not_exact(options.format.as_deref())? 
- .into_series() + .into_column() } }; if options.strict && ca.null_count() != out.null_count() { - handle_casting_failures(s, &out)?; + handle_casting_failures(s.as_materialized_series(), &out.as_materialized_series())?; } - Ok(out.into_series()) + Ok(out.into_column()) } #[cfg(feature = "dtype-datetime")] fn to_datetime( - s: &[Series], + s: &[Column], time_unit: &TimeUnit, time_zone: Option<&TimeZone>, options: &StrptimeOptions, -) -> PolarsResult { +) -> PolarsResult { let datetime_strings = &s[0].str()?; let ambiguous = &s[1].str()?; let tz_aware = match &options.format { @@ -705,7 +705,7 @@ fn to_datetime( time_zone, ambiguous, )? - .into_series() + .into_column() } else { datetime_strings .as_datetime_not_exact( @@ -715,17 +715,17 @@ fn to_datetime( time_zone, ambiguous, )? - .into_series() + .into_column() }; if options.strict && datetime_strings.null_count() != out.null_count() { - handle_casting_failures(&s[0], &out)?; + handle_casting_failures(&s[0].as_materialized_series(), &out.as_materialized_series())?; } - Ok(out.into_series()) + Ok(out.into_column()) } #[cfg(feature = "dtype-time")] -fn to_time(s: &Series, options: &StrptimeOptions) -> PolarsResult { +fn to_time(s: &Column, options: &StrptimeOptions) -> PolarsResult { polars_ensure!( options.exact, ComputeError: "non-exact not implemented for Time data type" ); @@ -733,33 +733,33 @@ fn to_time(s: &Series, options: &StrptimeOptions) -> PolarsResult { let ca = s.str()?; let out = ca .as_time(options.format.as_deref(), options.cache)? 
- .into_series(); + .into_column(); if options.strict && ca.null_count() != out.null_count() { - handle_casting_failures(s, &out)?; + handle_casting_failures(s.as_materialized_series(), &out.as_materialized_series())?; } - Ok(out.into_series()) + Ok(out.into_column()) } #[cfg(feature = "concat_str")] -pub(super) fn join(s: &Series, delimiter: &str, ignore_nulls: bool) -> PolarsResult { +pub(super) fn join(s: &Column, delimiter: &str, ignore_nulls: bool) -> PolarsResult { let str_s = s.cast(&DataType::String)?; let joined = polars_ops::chunked_array::str_join(str_s.str()?, delimiter, ignore_nulls); - Ok(joined.into_series()) + Ok(joined.into_column()) } #[cfg(feature = "concat_str")] pub(super) fn concat_hor( - series: &[Series], + series: &[Column], delimiter: &str, ignore_nulls: bool, -) -> PolarsResult { +) -> PolarsResult { let str_series: Vec<_> = series .iter() .map(|s| s.cast(&DataType::String)) .collect::>()?; let cas: Vec<_> = str_series.iter().map(|s| s.str().unwrap()).collect(); - Ok(polars_ops::chunked_array::hor_str_concat(&cas, delimiter, ignore_nulls)?.into_series()) + Ok(polars_ops::chunked_array::hor_str_concat(&cas, delimiter, ignore_nulls)?.into_column()) } impl From for FunctionExpr { @@ -906,7 +906,7 @@ fn replace_all<'a>( } #[cfg(feature = "regex")] -pub(super) fn replace(s: &[Series], literal: bool, n: i64) -> PolarsResult { +pub(super) fn replace(s: &[Column], literal: bool, n: i64) -> PolarsResult { let column = &s[0]; let pat = &s[1]; let val = &s[2]; @@ -921,24 +921,24 @@ pub(super) fn replace(s: &[Series], literal: bool, n: i64) -> PolarsResult PolarsResult { +pub(super) fn reverse(s: &Column) -> PolarsResult { let ca = s.str()?; - Ok(ca.str_reverse().into_series()) + Ok(ca.str_reverse().into_column()) } #[cfg(feature = "string_to_integer")] -pub(super) fn to_integer(s: &[Series], strict: bool) -> PolarsResult { +pub(super) fn to_integer(s: &[Column], strict: bool) -> PolarsResult { let ca = s[0].str()?; let base = 
s[1].strict_cast(&DataType::UInt32)?; ca.to_integer(base.u32()?, strict) - .map(|ok| ok.into_series()) + .map(|ok| ok.into_column()) } -fn _ensure_lengths(s: &[Series]) -> bool { +fn _ensure_lengths(s: &[Column]) -> bool { // Calculate the post-broadcast length and ensure everything is consistent. let len = s .iter() @@ -950,7 +950,7 @@ fn _ensure_lengths(s: &[Series]) -> bool { .all(|series| series.len() == 1 || series.len() == len) } -pub(super) fn str_slice(s: &[Series]) -> PolarsResult { +pub(super) fn str_slice(s: &[Column]) -> PolarsResult { polars_ensure!( _ensure_lengths(s), ComputeError: "all series in `str_slice` should have equal or unit length", @@ -958,68 +958,68 @@ pub(super) fn str_slice(s: &[Series]) -> PolarsResult { let ca = s[0].str()?; let offset = &s[1]; let length = &s[2]; - Ok(ca.str_slice(offset, length)?.into_series()) + Ok(ca.str_slice(offset, length)?.into_column()) } -pub(super) fn str_head(s: &[Series]) -> PolarsResult { +pub(super) fn str_head(s: &[Column]) -> PolarsResult { polars_ensure!( _ensure_lengths(s), ComputeError: "all series in `str_head` should have equal or unit length", ); let ca = s[0].str()?; let n = &s[1]; - Ok(ca.str_head(n)?.into_series()) + Ok(ca.str_head(n)?.into_column()) } -pub(super) fn str_tail(s: &[Series]) -> PolarsResult { +pub(super) fn str_tail(s: &[Column]) -> PolarsResult { polars_ensure!( _ensure_lengths(s), ComputeError: "all series in `str_tail` should have equal or unit length", ); let ca = s[0].str()?; let n = &s[1]; - Ok(ca.str_tail(n)?.into_series()) + Ok(ca.str_tail(n)?.into_column()) } #[cfg(feature = "string_encoding")] -pub(super) fn hex_encode(s: &Series) -> PolarsResult { - Ok(s.str()?.hex_encode().into_series()) +pub(super) fn hex_encode(s: &Column) -> PolarsResult { + Ok(s.str()?.hex_encode().into_column()) } #[cfg(feature = "binary_encoding")] -pub(super) fn hex_decode(s: &Series, strict: bool) -> PolarsResult { - s.str()?.hex_decode(strict).map(|ca| ca.into_series()) +pub(super) fn 
hex_decode(s: &Column, strict: bool) -> PolarsResult { + s.str()?.hex_decode(strict).map(|ca| ca.into_column()) } #[cfg(feature = "string_encoding")] -pub(super) fn base64_encode(s: &Series) -> PolarsResult { - Ok(s.str()?.base64_encode().into_series()) +pub(super) fn base64_encode(s: &Column) -> PolarsResult { + Ok(s.str()?.base64_encode().into_column()) } #[cfg(feature = "binary_encoding")] -pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult { - s.str()?.base64_decode(strict).map(|ca| ca.into_series()) +pub(super) fn base64_decode(s: &Column, strict: bool) -> PolarsResult { + s.str()?.base64_decode(strict).map(|ca| ca.into_column()) } #[cfg(feature = "dtype-decimal")] -pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult { +pub(super) fn to_decimal(s: &Column, infer_len: usize) -> PolarsResult { let ca = s.str()?; - ca.to_decimal(infer_len) + ca.to_decimal(infer_len).map(Column::from) } #[cfg(feature = "extract_jsonpath")] pub(super) fn json_decode( - s: &Series, + s: &Column, dtype: Option, infer_schema_len: Option, -) -> PolarsResult { +) -> PolarsResult { let ca = s.str()?; - ca.json_decode(dtype, infer_schema_len) + ca.json_decode(dtype, infer_schema_len).map(Column::from) } #[cfg(feature = "extract_jsonpath")] -pub(super) fn json_path_match(s: &[Series]) -> PolarsResult { +pub(super) fn json_path_match(s: &[Column]) -> PolarsResult { let ca = s[0].str()?; let pat = s[1].str()?; - Ok(ca.json_path_match(pat)?.into_series()) + Ok(ca.json_path_match(pat)?.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/struct_.rs b/crates/polars-plan/src/dsl/function_expr/struct_.rs index 98753314c6e2..3c72138e6241 100644 --- a/crates/polars-plan/src/dsl/function_expr/struct_.rs +++ b/crates/polars-plan/src/dsl/function_expr/struct_.rs @@ -142,7 +142,7 @@ impl Display for StructFunction { } } -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: StructFunction) -> Self { use StructFunction::*; match func 
{ @@ -159,12 +159,12 @@ impl From for SpecialEq> { } } -pub(super) fn get_by_name(s: &Series, name: &str) -> PolarsResult { +pub(super) fn get_by_name(s: &Column, name: &str) -> PolarsResult { let ca = s.struct_()?; - ca.field_by_name(name) + ca.field_by_name(name).map(Column::from) } -pub(super) fn rename_fields(s: &Series, names: Arc<[PlSmallStr]>) -> PolarsResult { +pub(super) fn rename_fields(s: &Column, names: Arc<[PlSmallStr]>) -> PolarsResult { let ca = s.struct_()?; let fields = ca .fields_as_series() @@ -178,10 +178,10 @@ pub(super) fn rename_fields(s: &Series, names: Arc<[PlSmallStr]>) -> PolarsResul .collect::>(); let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; out.zip_outer_validity(ca); - Ok(out.into_series()) + Ok(out.into_column()) } -pub(super) fn prefix_fields(s: &Series, prefix: &str) -> PolarsResult { +pub(super) fn prefix_fields(s: &Column, prefix: &str) -> PolarsResult { let ca = s.struct_()?; let fields = ca .fields_as_series() @@ -195,10 +195,10 @@ pub(super) fn prefix_fields(s: &Series, prefix: &str) -> PolarsResult { .collect::>(); let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; out.zip_outer_validity(ca); - Ok(out.into_series()) + Ok(out.into_column()) } -pub(super) fn suffix_fields(s: &Series, suffix: &str) -> PolarsResult { +pub(super) fn suffix_fields(s: &Column, suffix: &str) -> PolarsResult { let ca = s.struct_()?; let fields = ca .fields_as_series() @@ -212,11 +212,11 @@ pub(super) fn suffix_fields(s: &Series, suffix: &str) -> PolarsResult { .collect::>(); let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; out.zip_outer_validity(ca); - Ok(out.into_series()) + Ok(out.into_column()) } #[cfg(feature = "json")] -pub(super) fn to_json(s: &Series) -> PolarsResult { +pub(super) fn to_json(s: &Column) -> PolarsResult { let ca = s.struct_()?; let dtype = ca.dtype().to_arrow(CompatLevel::newest()); @@ -225,10 +225,10 @@ pub(super) fn to_json(s: &Series) -> PolarsResult { 
polars_json::json::write::serialize_to_utf8(arr.as_ref()) }); - Ok(StringChunked::from_chunk_iter(ca.name().clone(), iter).into_series()) + Ok(StringChunked::from_chunk_iter(ca.name().clone(), iter).into_column()) } -pub(super) fn with_fields(args: &[Series]) -> PolarsResult { +pub(super) fn with_fields(args: &[Column]) -> PolarsResult { let s = &args[0]; let ca = s.struct_()?; @@ -241,11 +241,11 @@ pub(super) fn with_fields(args: &[Series]) -> PolarsResult { } for field in &args[1..] { - fields.insert(field.name(), field); + fields.insert(field.name(), field.as_materialized_series()); } let new_fields = fields.into_values().cloned().collect::>(); let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?; out.zip_outer_validity(ca); - Ok(out.into_series()) + Ok(out.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/temporal.rs b/crates/polars-plan/src/dsl/function_expr/temporal.rs index 18340a00adaf..dcb5005ae4b1 100644 --- a/crates/polars-plan/src/dsl/function_expr/temporal.rs +++ b/crates/polars-plan/src/dsl/function_expr/temporal.rs @@ -1,7 +1,7 @@ use super::*; use crate::{map, map_as_slice}; -impl From for SpecialEq> { +impl From for SpecialEq> { fn from(func: TemporalFunction) -> Self { use TemporalFunction::*; match func { @@ -71,10 +71,10 @@ impl From for SpecialEq> { } pub(super) fn datetime( - s: &[Series], + s: &[Column], time_unit: &TimeUnit, time_zone: Option<&str>, -) -> PolarsResult { +) -> PolarsResult { use polars_core::export::chrono::NaiveDate; use polars_core::utils::CustomIterTools; @@ -177,12 +177,12 @@ pub(super) fn datetime( }, }; - let mut s = ca.into_series(); + let mut s = ca.into_column(); s.rename(PlSmallStr::from_static("datetime")); Ok(s) } -pub(super) fn combine(s: &[Series], tu: TimeUnit) -> PolarsResult { +pub(super) fn combine(s: &[Column], tu: TimeUnit) -> PolarsResult { let date = &s[0]; let time = &s[1]; @@ -207,7 +207,7 @@ pub(super) fn combine(s: &[Series], tu: TimeUnit) -> PolarsResult { 
&StringChunked::from_iter(std::iter::once("raise")), NonExistent::Raise, )? - .into()), + .into_column()), _ => result_naive, } } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 45c989e16dcb..0e7080b0461b 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -48,9 +48,9 @@ impl Display for TrigonometricFunction { } pub(super) fn apply_trigonometric_function( - s: &Series, + s: &Column, trig_function: TrigonometricFunction, -) -> PolarsResult { +) -> PolarsResult { use DataType::*; match s.dtype() { Float32 => { @@ -69,7 +69,7 @@ pub(super) fn apply_trigonometric_function( } } -pub(super) fn apply_arctan2(s: &mut [Series]) -> PolarsResult> { +pub(super) fn apply_arctan2(s: &mut [Column]) -> PolarsResult> { let y = &s[0]; let x = &s[1]; @@ -77,8 +77,8 @@ pub(super) fn apply_arctan2(s: &mut [Series]) -> PolarsResult> { let x_len = x.len(); match (y_len, x_len) { - (1, _) | (_, 1) => arctan2_on_series(y, x), - (len_a, len_b) if len_a == len_b => arctan2_on_series(y, x), + (1, _) | (_, 1) => arctan2_on_columns(y, x), + (len_a, len_b) if len_a == len_b => arctan2_on_columns(y, x), _ => polars_bail!( ComputeError: "y shape: {} in `arctan2` expression does not match that of x: {}", @@ -87,7 +87,7 @@ pub(super) fn apply_arctan2(s: &mut [Series]) -> PolarsResult> { } } -fn arctan2_on_series(y: &Series, x: &Series) -> PolarsResult> { +fn arctan2_on_columns(y: &Column, x: &Column) -> PolarsResult> { use DataType::*; match y.dtype() { Float32 => { @@ -100,36 +100,36 @@ fn arctan2_on_series(y: &Series, x: &Series) -> PolarsResult> { }, _ => { let y = y.cast(&DataType::Float64)?; - arctan2_on_series(&y, x) + arctan2_on_columns(&y, x) }, } } -fn arctan2_on_floats(y: &ChunkedArray, x: &Series) -> PolarsResult> +fn arctan2_on_floats(y: &ChunkedArray, x: &Column) -> PolarsResult> where T: PolarsFloatType, T::Native: 
Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { let dtype = T::get_dtype(); let x = x.cast(&dtype)?; - let x = y.unpack_series_matching_type(&x).unwrap(); + let x = y.unpack_series_matching_type(x.as_materialized_series()).unwrap(); if x.len() == 1 { let x_value = x .get(0) .ok_or_else(|| polars_err!(ComputeError: "arctan2 x value is null"))?; - Ok(Some(y.apply_values(|v| v.atan2(x_value)).into_series())) + Ok(Some(y.apply_values(|v| v.atan2(x_value)).into_column())) } else if y.len() == 1 { let y_value = y .get(0) .ok_or_else(|| polars_err!(ComputeError: "arctan2 y value is null"))?; - Ok(Some(x.apply_values(|v| y_value.atan2(v)).into_series())) + Ok(Some(x.apply_values(|v| y_value.atan2(v)).into_column())) } else { Ok(Some( - polars_core::prelude::arity::binary(y, x, atan2_kernel).into_series(), + polars_core::prelude::arity::binary(y, x, atan2_kernel).into_column(), )) } } @@ -137,11 +137,11 @@ where fn apply_trigonometric_function_to_float( ca: &ChunkedArray, trig_function: TrigonometricFunction, -) -> PolarsResult +) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { match trig_function { TrigonometricFunction::Cos => cos(ca), @@ -162,137 +162,137 @@ where } } -fn cos(ca: &ChunkedArray) -> PolarsResult +fn cos(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.cos()).into_series()) + Ok(ca.apply_values(|v| v.cos()).into_column()) } -fn cot(ca: &ChunkedArray) -> PolarsResult +fn cot(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.tan().powi(-1)).into_series()) + Ok(ca.apply_values(|v| v.tan().powi(-1)).into_column()) } -fn sin(ca: &ChunkedArray) -> PolarsResult +fn sin(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - 
ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.sin()).into_series()) + Ok(ca.apply_values(|v| v.sin()).into_column()) } -fn tan(ca: &ChunkedArray) -> PolarsResult +fn tan(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.tan()).into_series()) + Ok(ca.apply_values(|v| v.tan()).into_column()) } -fn arccos(ca: &ChunkedArray) -> PolarsResult +fn arccos(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.acos()).into_series()) + Ok(ca.apply_values(|v| v.acos()).into_column()) } -fn arcsin(ca: &ChunkedArray) -> PolarsResult +fn arcsin(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.asin()).into_series()) + Ok(ca.apply_values(|v| v.asin()).into_column()) } -fn arctan(ca: &ChunkedArray) -> PolarsResult +fn arctan(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.atan()).into_series()) + Ok(ca.apply_values(|v| v.atan()).into_column()) } -fn cosh(ca: &ChunkedArray) -> PolarsResult +fn cosh(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.cosh()).into_series()) + Ok(ca.apply_values(|v| v.cosh()).into_column()) } -fn sinh(ca: &ChunkedArray) -> PolarsResult +fn sinh(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.sinh()).into_series()) + Ok(ca.apply_values(|v| v.sinh()).into_column()) } -fn tanh(ca: &ChunkedArray) -> PolarsResult +fn tanh(ca: &ChunkedArray) -> 
PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.tanh()).into_series()) + Ok(ca.apply_values(|v| v.tanh()).into_column()) } -fn arccosh(ca: &ChunkedArray) -> PolarsResult +fn arccosh(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.acosh()).into_series()) + Ok(ca.apply_values(|v| v.acosh()).into_column()) } -fn arcsinh(ca: &ChunkedArray) -> PolarsResult +fn arcsinh(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.asinh()).into_series()) + Ok(ca.apply_values(|v| v.asinh()).into_column()) } -fn arctanh(ca: &ChunkedArray) -> PolarsResult +fn arctanh(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.atanh()).into_series()) + Ok(ca.apply_values(|v| v.atanh()).into_column()) } -fn degrees(ca: &ChunkedArray) -> PolarsResult +fn degrees(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.to_degrees()).into_series()) + Ok(ca.apply_values(|v| v.to_degrees()).into_column()) } -fn radians(ca: &ChunkedArray) -> PolarsResult +fn radians(ca: &ChunkedArray) -> PolarsResult where T: PolarsFloatType, T::Native: Float, - ChunkedArray: IntoSeries, + ChunkedArray: IntoColumn, { - Ok(ca.apply_values(|v| v.to_radians()).into_series()) + Ok(ca.apply_values(|v| v.to_radians()).into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/unique.rs b/crates/polars-plan/src/dsl/function_expr/unique.rs index 68b665056444..c9f22a841f37 100644 --- a/crates/polars-plan/src/dsl/function_expr/unique.rs +++ 
b/crates/polars-plan/src/dsl/function_expr/unique.rs @@ -1,6 +1,6 @@ use super::*; -pub(super) fn unique(s: &Series, stable: bool) -> PolarsResult { +pub(super) fn unique(s: &Column, stable: bool) -> PolarsResult { if stable { s.unique_stable() } else { diff --git a/crates/polars-plan/src/dsl/functions/arity.rs b/crates/polars-plan/src/dsl/functions/arity.rs index 9e4c2ac73354..e3fb7b884885 100644 --- a/crates/polars-plan/src/dsl/functions/arity.rs +++ b/crates/polars-plan/src/dsl/functions/arity.rs @@ -2,9 +2,9 @@ use super::*; macro_rules! prepare_binary_function { ($f:ident) => { - move |s: &mut [Series]| { - let s0 = std::mem::take(&mut s[0]); - let s1 = std::mem::take(&mut s[1]); + move |c: &mut [Column]| { + let s0 = std::mem::take(&mut c[0]); + let s1 = std::mem::take(&mut c[1]); $f(s0, s1) } @@ -16,7 +16,7 @@ macro_rules! prepare_binary_function { /// The closure takes two arguments, each a [`Series`]. `output_type` must be the output dtype of the resulting [`Series`]. pub fn map_binary(a: Expr, b: Expr, f: F, output_type: GetOutput) -> Expr where - F: 'static + Fn(Series, Series) -> PolarsResult> + Send + Sync, + F: 'static + Fn(Column, Column) -> PolarsResult> + Send + Sync, { let function = prepare_binary_function!(f); a.map_many(function, &[b], output_type) @@ -27,7 +27,7 @@ where /// See [`Expr::apply`] for the difference between [`map`](Expr::map) and [`apply`](Expr::apply). 
pub fn apply_binary(a: Expr, b: Expr, f: F, output_type: GetOutput) -> Expr where - F: 'static + Fn(Series, Series) -> PolarsResult> + Send + Sync, + F: 'static + Fn(Column, Column) -> PolarsResult> + Send + Sync, { let function = prepare_binary_function!(f); a.apply_many(function, &[b], output_type) diff --git a/crates/polars-plan/src/dsl/functions/horizontal.rs b/crates/polars-plan/src/dsl/functions/horizontal.rs index eb0c79b3b0f7..b792c98956b3 100644 --- a/crates/polars-plan/src/dsl/functions/horizontal.rs +++ b/crates/polars-plan/src/dsl/functions/horizontal.rs @@ -22,23 +22,23 @@ fn cum_fold_dtype() -> GetOutput { /// Accumulate over multiple columns horizontally / row wise. pub fn fold_exprs(acc: Expr, f: F, exprs: E) -> Expr where - F: 'static + Fn(Series, Series) -> PolarsResult> + Send + Sync + Clone, + F: 'static + Fn(Column, Column) -> PolarsResult> + Send + Sync + Clone, E: AsRef<[Expr]>, { let mut exprs = exprs.as_ref().to_vec(); exprs.push(acc); - let function = SpecialEq::new(Arc::new(move |series: &mut [Series]| { - let mut series = series.to_vec(); - let mut acc = series.pop().unwrap(); + let function = SpecialEq::new(Arc::new(move |columns: &mut [Column]| { + let mut columns = columns.to_vec(); + let mut acc = columns.pop().unwrap(); - for s in series { - if let Some(a) = f(acc.clone(), s)? { + for c in columns { + if let Some(a) = f(acc.clone(), c)? { acc = a } } Ok(Some(acc)) - }) as Arc); + }) as Arc); Expr::AnonymousFunction { input: exprs, @@ -62,20 +62,20 @@ where /// `collect` is called. 
pub fn reduce_exprs(f: F, exprs: E) -> Expr where - F: 'static + Fn(Series, Series) -> PolarsResult> + Send + Sync + Clone, + F: 'static + Fn(Column, Column) -> PolarsResult> + Send + Sync + Clone, E: AsRef<[Expr]>, { let exprs = exprs.as_ref().to_vec(); - let function = SpecialEq::new(Arc::new(move |series: &mut [Series]| { - let mut s_iter = series.iter(); + let function = SpecialEq::new(Arc::new(move |columns: &mut [Column]| { + let mut c_iter = columns.iter(); - match s_iter.next() { + match c_iter.next() { Some(acc) => { let mut acc = acc.clone(); - for s in s_iter { - if let Some(a) = f(acc.clone(), s.clone())? { + for c in c_iter { + if let Some(a) = f(acc.clone(), c.clone())? { acc = a } } @@ -83,7 +83,7 @@ where }, None => Err(polars_err!(ComputeError: "`reduce` did not have any expressions to fold")), } - }) as Arc); + }) as Arc); Expr::AnonymousFunction { input: exprs, @@ -104,34 +104,34 @@ where #[cfg(feature = "dtype-struct")] pub fn cum_reduce_exprs(f: F, exprs: E) -> Expr where - F: 'static + Fn(Series, Series) -> PolarsResult> + Send + Sync + Clone, + F: 'static + Fn(Column, Column) -> PolarsResult> + Send + Sync + Clone, E: AsRef<[Expr]>, { let exprs = exprs.as_ref().to_vec(); - let function = SpecialEq::new(Arc::new(move |series: &mut [Series]| { - let mut s_iter = series.iter(); + let function = SpecialEq::new(Arc::new(move |columns: &mut [Column]| { + let mut c_iter = columns.iter(); - match s_iter.next() { + match c_iter.next() { Some(acc) => { let mut acc = acc.clone(); let mut result = vec![acc.clone()]; - for s in s_iter { - let name = s.name().clone(); - if let Some(a) = f(acc.clone(), s.clone())? { + for c in c_iter { + let name = c.name().clone(); + if let Some(a) = f(acc.clone(), c.clone())? 
{ acc = a; } acc.rename(name); result.push(acc.clone()); } - StructChunked::from_series(acc.name().clone(), &result) - .map(|ca| Some(ca.into_series())) + StructChunked::from_columns(acc.name().clone(), &result) + .map(|ca| Some(ca.into_column())) }, None => Err(polars_err!(ComputeError: "`reduce` did not have any expressions to fold")), } - }) as Arc); + }) as Arc); Expr::AnonymousFunction { input: exprs, @@ -152,32 +152,32 @@ where #[cfg(feature = "dtype-struct")] pub fn cum_fold_exprs(acc: Expr, f: F, exprs: E, include_init: bool) -> Expr where - F: 'static + Fn(Series, Series) -> PolarsResult> + Send + Sync + Clone, + F: 'static + Fn(Column, Column) -> PolarsResult> + Send + Sync + Clone, E: AsRef<[Expr]>, { let mut exprs = exprs.as_ref().to_vec(); exprs.push(acc); - let function = SpecialEq::new(Arc::new(move |series: &mut [Series]| { - let mut series = series.to_vec(); - let mut acc = series.pop().unwrap(); + let function = SpecialEq::new(Arc::new(move |columns: &mut [Column]| { + let mut columns = columns.to_vec(); + let mut acc = columns.pop().unwrap(); let mut result = vec![]; if include_init { result.push(acc.clone()) } - for s in series { - let name = s.name().clone(); - if let Some(a) = f(acc.clone(), s)? { + for c in columns { + let name = c.name().clone(); + if let Some(a) = f(acc.clone(), c)? 
{ acc = a; acc.rename(name); result.push(acc.clone()); } } - StructChunked::from_series(acc.name().clone(), &result).map(|ca| Some(ca.into_series())) - }) as Arc); + StructChunked::from_columns(acc.name().clone(), &result).map(|ca| Some(ca.into_column())) + }) as Arc); Expr::AnonymousFunction { input: exprs, diff --git a/crates/polars-plan/src/dsl/functions/repeat.rs b/crates/polars-plan/src/dsl/functions/repeat.rs index 5c3084fb7caf..21d27a542e99 100644 --- a/crates/polars-plan/src/dsl/functions/repeat.rs +++ b/crates/polars-plan/src/dsl/functions/repeat.rs @@ -5,7 +5,7 @@ use super::*; /// Generally you won't need this function, as `lit(value)` already represents a column containing /// only `value` whose length is automatically set to the correct number of rows. pub fn repeat>(value: E, n: Expr) -> Expr { - let function = |s: Series, n: Series| { + let function = |s: Column, n: Column| { polars_ensure!( n.dtype().is_integer(), SchemaMismatch: "expected expression of dtype 'integer', got '{}'", n.dtype() diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index 11e825a7ec1f..fb0c7a83b463 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -295,7 +295,7 @@ impl ListNameSpace { move |s| { s.list()? 
.to_struct(n_fields, name_generator.clone()) - .map(|s| Some(s.into_series())) + .map(|s| Some(s.into_column())) }, // we don't yet know the fields GetOutput::map_dtype(move |dt: &DataType| { diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 895020ce43f5..a2cf069e1db3 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -323,10 +323,10 @@ impl Expr { }; self.function_with_options( - move |s: Series| { - Ok(Some(Series::new( - s.name().clone(), - &[s.arg_min().map(|idx| idx as u32)], + move |c: Column| { + Ok(Some(Column::new( + c.name().clone(), + &[c.as_materialized_series().arg_min().map(|idx| idx as u32)], ))) }, GetOutput::from_type(IDX_DTYPE), @@ -344,10 +344,12 @@ impl Expr { }; self.function_with_options( - move |s: Series| { - Ok(Some(Series::new( - s.name().clone(), - &[s.arg_max().map(|idx| idx as IdxSize)], + move |c: Column| { + Ok(Some(Column::new( + c.name().clone(), + &[c.as_materialized_series() + .arg_max() + .map(|idx| idx as IdxSize)], ))) }, GetOutput::from_type(IDX_DTYPE), @@ -364,7 +366,13 @@ impl Expr { }; self.function_with_options( - move |s: Series| Ok(Some(s.arg_sort(sort_options).into_series())), + move |c: Column| { + Ok(Some( + c.as_materialized_series() + .arg_sort(sort_options) + .into_column(), + )) + }, GetOutput::from_type(IDX_DTYPE), options, ) @@ -535,9 +543,9 @@ impl Expr { /// the correct output_type. If None given the output type of the input expr is used. 
pub fn map(self, function: F, output_type: GetOutput) -> Self where - F: Fn(Series) -> PolarsResult> + 'static + Send + Sync, + F: Fn(Column) -> PolarsResult> + 'static + Send + Sync, { - let f = move |s: &mut [Series]| function(std::mem::take(&mut s[0])); + let f = move |c: &mut [Column]| function(std::mem::take(&mut c[0])); Expr::AnonymousFunction { input: vec![self], @@ -568,7 +576,7 @@ impl Expr { /// See the [`Expr::map`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply). pub fn map_many(self, function: F, arguments: &[Expr], output_type: GetOutput) -> Self where - F: Fn(&mut [Series]) -> PolarsResult> + 'static + Send + Sync, + F: Fn(&mut [Column]) -> PolarsResult> + 'static + Send + Sync, { let mut input = vec![self]; input.extend_from_slice(arguments); @@ -594,9 +602,9 @@ impl Expr { /// * `map_list` should be used when the function expects a list aggregated series. pub fn map_list(self, function: F, output_type: GetOutput) -> Self where - F: Fn(Series) -> PolarsResult> + 'static + Send + Sync, + F: Fn(Column) -> PolarsResult> + 'static + Send + Sync, { - let f = move |s: &mut [Series]| function(std::mem::take(&mut s[0])); + let f = move |c: &mut [Column]| function(std::mem::take(&mut c[0])); Expr::AnonymousFunction { input: vec![self], @@ -618,9 +626,9 @@ impl Expr { options: FunctionOptions, ) -> Self where - F: Fn(Series) -> PolarsResult> + 'static + Send + Sync, + F: Fn(Column) -> PolarsResult> + 'static + Send + Sync, { - let f = move |s: &mut [Series]| function(std::mem::take(&mut s[0])); + let f = move |c: &mut [Column]| function(std::mem::take(&mut c[0])); Expr::AnonymousFunction { input: vec![self], @@ -641,9 +649,9 @@ impl Expr { /// * `apply` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc. 
pub fn apply(self, function: F, output_type: GetOutput) -> Self where - F: Fn(Series) -> PolarsResult> + 'static + Send + Sync, + F: Fn(Column) -> PolarsResult> + 'static + Send + Sync, { - let f = move |s: &mut [Series]| function(std::mem::take(&mut s[0])); + let f = move |c: &mut [Column]| function(std::mem::take(&mut c[0])); Expr::AnonymousFunction { input: vec![self], @@ -673,7 +681,7 @@ impl Expr { /// See the [`Expr::apply`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply). pub fn apply_many(self, function: F, arguments: &[Expr], output_type: GetOutput) -> Self where - F: Fn(&mut [Series]) -> PolarsResult> + 'static + Send + Sync, + F: Fn(&mut [Column]) -> PolarsResult> + 'static + Send + Sync, { let mut input = vec![self]; input.extend_from_slice(arguments); @@ -829,8 +837,12 @@ impl Expr { }; self.function_with_options( - move |s: Series| { - Some(s.product().map(|sc| sc.into_series(s.name().clone()))).transpose() + move |c: Column| { + Some( + c.product() + .map(|sc| sc.into_series(c.name().clone()).into_column()), + ) + .transpose() }, GetOutput::map_dtype(|dt| { use DataType as T; @@ -1463,7 +1475,12 @@ impl Expr { options: RollingOptionsFixedWindow, ) -> Expr { self.apply( - move |s| s.rolling_map(f.as_ref(), options.clone()).map(Some), + move |c: Column| { + c.as_materialized_series() + .rolling_map(f.as_ref(), options.clone()) + .map(Column::from) + .map(Some) + }, output_type, ) .with_fmt("rolling_map") @@ -1478,24 +1495,24 @@ impl Expr { F: 'static + FnMut(&mut Float64Chunked) -> Option + Send + Sync + Copy, { self.apply( - move |s| { - let out = match s.dtype() { - DataType::Float64 => s + move |c: Column| { + let out = match c.dtype() { + DataType::Float64 => c .f64() .unwrap() .rolling_map_float(window_size, f) - .map(|ca| ca.into_series()), - _ => s + .map(|ca| ca.into_column()), + _ => c .cast(&DataType::Float64)? 
.f64() .unwrap() .rolling_map_float(window_size, f) - .map(|ca| ca.into_series()), + .map(|ca| ca.into_column()), }?; - if let DataType::Float32 = s.dtype() { - out.cast(&DataType::Float32).map(Some) + if let DataType::Float32 = c.dtype() { + out.cast(&DataType::Float32).map(Column::from).map(Some) } else { - Ok(Some(out)) + Ok(Some(out.into())) } }, GetOutput::map_field(|field| { @@ -1952,7 +1969,7 @@ impl Expr { /// the correct output_type. If None given the output type of the input expr is used. pub fn map_multiple(function: F, expr: E, output_type: GetOutput) -> Expr where - F: Fn(&mut [Series]) -> PolarsResult> + 'static + Send + Sync, + F: Fn(&mut [Column]) -> PolarsResult> + 'static + Send + Sync, E: AsRef<[Expr]>, { let input = expr.as_ref().to_vec(); @@ -1978,7 +1995,7 @@ where /// * `map_list_mul` should be used when the function expects a list aggregated series. pub fn map_list_multiple(function: F, expr: E, output_type: GetOutput) -> Expr where - F: Fn(&mut [Series]) -> PolarsResult> + 'static + Send + Sync, + F: Fn(&mut [Column]) -> PolarsResult> + 'static + Send + Sync, E: AsRef<[Expr]>, { let input = expr.as_ref().to_vec(); @@ -2012,7 +2029,7 @@ pub fn apply_multiple( returns_scalar: bool, ) -> Expr where - F: Fn(&mut [Series]) -> PolarsResult> + 'static + Send + Sync, + F: Fn(&mut [Column]) -> PolarsResult> + 'static + Send + Sync, E: AsRef<[Expr]>, { let input = expr.as_ref().to_vec(); diff --git a/crates/polars-plan/src/dsl/name.rs b/crates/polars-plan/src/dsl/name.rs index 70bbc830b3c0..1df62a767721 100644 --- a/crates/polars-plan/src/dsl/name.rs +++ b/crates/polars-plan/src/dsl/name.rs @@ -78,7 +78,7 @@ impl ExprNameNameSpace { .collect::>(); let mut out = StructChunked::from_series(s.name().clone(), &fields)?; out.zip_outer_validity(s); - Ok(Some(out.into_series())) + Ok(Some(out.into_column())) }, GetOutput::map_dtype(move |dt| match dt { DataType::Struct(fds) => { diff --git a/crates/polars-plan/src/dsl/python_udf.rs 
b/crates/polars-plan/src/dsl/python_udf.rs index b105f62df482..cdd8194793e6 100644 --- a/crates/polars-plan/src/dsl/python_udf.rs +++ b/crates/polars-plan/src/dsl/python_udf.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use polars_core::datatypes::{DataType, Field}; use polars_core::error::*; use polars_core::frame::DataFrame; +use polars_core::frame::column::Column; use polars_core::prelude::Series; use polars_core::schema::Schema; use pyo3::prelude::*; @@ -20,7 +21,7 @@ use crate::prelude::*; // Will be overwritten on Python Polars start up. pub static mut CALL_SERIES_UDF_PYTHON: Option< - fn(s: Series, lambda: &PyObject) -> PolarsResult, + fn(s: Column, lambda: &PyObject) -> PolarsResult, > = None; pub static mut CALL_DF_UDF_PYTHON: Option< fn(s: DataFrame, lambda: &PyObject) -> PolarsResult, @@ -124,7 +125,7 @@ impl PythonUdfExpression { } #[cfg(feature = "serde")] - pub(crate) fn try_deserialize(buf: &[u8]) -> PolarsResult> { + pub(crate) fn try_deserialize(buf: &[u8]) -> PolarsResult> { debug_assert!(buf.starts_with(MAGIC_BYTE_MARK)); // skip header let buf = &buf[MAGIC_BYTE_MARK.len()..]; @@ -147,7 +148,7 @@ impl PythonUdfExpression { output_type, is_elementwise, returns_scalar, - )) as Arc) + )) as Arc) }) } } @@ -163,8 +164,8 @@ impl DataFrameUdf for PythonFunction { } } -impl SeriesUdf for PythonUdfExpression { - fn call_udf(&self, s: &mut [Series]) -> PolarsResult> { +impl ColumnsUdf for PythonUdfExpression { + fn call_udf(&self, s: &mut [Column]) -> PolarsResult> { let func = unsafe { CALL_SERIES_UDF_PYTHON.unwrap() }; let output_type = self diff --git a/crates/polars-plan/src/dsl/udf.rs b/crates/polars-plan/src/dsl/udf.rs index fe01cab03ea2..74371639a54a 100644 --- a/crates/polars-plan/src/dsl/udf.rs +++ b/crates/polars-plan/src/dsl/udf.rs @@ -5,7 +5,7 @@ use polars_core::prelude::Field; use polars_core::schema::Schema; use polars_utils::pl_str::PlSmallStr; -use super::{Expr, GetOutput, SeriesUdf, SpecialEq}; +use super::{Expr, GetOutput, ColumnsUdf, 
SpecialEq}; use crate::prelude::{Context, FunctionOptions}; /// Represents a user-defined function @@ -18,7 +18,7 @@ pub struct UserDefinedFunction { /// The function output type. pub return_type: GetOutput, /// The function implementation. - pub fun: SpecialEq>, + pub fun: SpecialEq>, /// Options for the function. pub options: FunctionOptions, } @@ -40,7 +40,7 @@ impl UserDefinedFunction { name: PlSmallStr, input_fields: Vec, return_type: GetOutput, - fun: impl SeriesUdf + 'static, + fun: impl ColumnsUdf + 'static, ) -> Self { Self { name, diff --git a/crates/polars-plan/src/plans/aexpr/mod.rs b/crates/polars-plan/src/plans/aexpr/mod.rs index 4be23e79df14..42bfff7cabab 100644 --- a/crates/polars-plan/src/plans/aexpr/mod.rs +++ b/crates/polars-plan/src/plans/aexpr/mod.rs @@ -174,7 +174,7 @@ pub enum AExpr { }, AnonymousFunction { input: Vec, - function: SpecialEq>, + function: SpecialEq>, output_type: GetOutput, options: FunctionOptions, }, diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 2b66907d6916..de2ac244ef29 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -82,7 +82,7 @@ pub fn count_rows( |_| polars_err!(ComputeError: "count of {} exceeded maximum row size", count), )?; let column_name = alias.unwrap_or(PlSmallStr::from_static(crate::constants::LEN)); - DataFrame::new(vec![Series::new(column_name, [count])]) + DataFrame::new(vec![Column::new(column_name, [count])]) } } diff --git a/crates/polars-plan/src/plans/functions/merge_sorted.rs b/crates/polars-plan/src/plans/functions/merge_sorted.rs index ffc9e1f04df6..605a628c3c88 100644 --- a/crates/polars-plan/src/plans/functions/merge_sorted.rs +++ b/crates/polars-plan/src/plans/functions/merge_sorted.rs @@ -11,9 +11,10 @@ pub(super) fn merge_sorted(df: &DataFrame, column: &str) -> PolarsResult>(), df.get_columns() @@ -21,9 +22,10 @@ pub(super) fn merge_sorted(df: &DataFrame, 
column: &str) -> PolarsResult>(), ) @@ -34,5 +36,11 @@ pub(super) fn merge_sorted(df: &DataFrame, column: &str) -> PolarsResult, + group_by: Vec, options: &RollingGroupOptions, - ) -> PolarsResult<(Series, Vec, GroupsProxy)>; + ) -> PolarsResult<(Column, Vec, GroupsProxy)>; fn group_by_dynamic( &self, - group_by: Vec, + group_by: Vec, options: &DynamicGroupOptions, - ) -> PolarsResult<(Series, Vec, GroupsProxy)>; + ) -> PolarsResult<(Column, Vec, GroupsProxy)>; } impl PolarsTemporalGroupby for DataFrame { fn rolling( &self, - group_by: Vec, + group_by: Vec, options: &RollingGroupOptions, - ) -> PolarsResult<(Series, Vec, GroupsProxy)> { + ) -> PolarsResult<(Column, Vec, GroupsProxy)> { Wrap(self).rolling(group_by, options) } fn group_by_dynamic( &self, - group_by: Vec, + group_by: Vec, options: &DynamicGroupOptions, - ) -> PolarsResult<(Series, Vec, GroupsProxy)> { + ) -> PolarsResult<(Column, Vec, GroupsProxy)> { Wrap(self).group_by_dynamic(group_by, options) } } @@ -116,9 +116,9 @@ impl PolarsTemporalGroupby for DataFrame { impl Wrap<&DataFrame> { fn rolling( &self, - group_by: Vec, + group_by: Vec, options: &RollingGroupOptions, - ) -> PolarsResult<(Series, Vec, GroupsProxy)> { + ) -> PolarsResult<(Column, Vec, GroupsProxy)> { polars_ensure!( !options.period.is_zero() && !options.period.negative, ComputeError: @@ -128,7 +128,7 @@ impl Wrap<&DataFrame> { if group_by.is_empty() { // If by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized. 
- time.ensure_sorted_arg("rolling")?; + time.as_materialized_series().ensure_sorted_arg("rolling")?; } let time_type = time.dtype(); @@ -137,7 +137,7 @@ impl Wrap<&DataFrame> { ensure_duration_matches_dtype(options.offset, time_type, "offset")?; use DataType::*; - let (dt, tu, tz): (Series, TimeUnit, Option) = match time_type { + let (dt, tu, tz): (Column, TimeUnit, Option) = match time_type { Datetime(tu, tz) => (time.clone(), *tu, tz.clone()), Date => ( time.cast(&Datetime(TimeUnit::Milliseconds, None))?, @@ -190,14 +190,14 @@ impl Wrap<&DataFrame> { /// Returns: time_keys, keys, groupsproxy. fn group_by_dynamic( &self, - group_by: Vec, + group_by: Vec, options: &DynamicGroupOptions, - ) -> PolarsResult<(Series, Vec, GroupsProxy)> { + ) -> PolarsResult<(Column, Vec, GroupsProxy)> { let time = self.0.column(&options.index_column)?.rechunk(); if group_by.is_empty() { // If by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized. 
- time.ensure_sorted_arg("group_by_dynamic")?; + time.as_materialized_series().ensure_sorted_arg("group_by_dynamic")?; } let time_type = time.dtype(); @@ -260,12 +260,12 @@ impl Wrap<&DataFrame> { fn impl_group_by_dynamic( &self, - mut dt: Series, - mut by: Vec, + mut dt: Column, + mut by: Vec, options: &DynamicGroupOptions, tu: TimeUnit, time_type: &DataType, - ) -> PolarsResult<(Series, Vec, GroupsProxy)> { + ) -> PolarsResult<(Column, Vec, GroupsProxy)> { polars_ensure!(!options.every.negative, ComputeError: "'every' argument must be positive"); if dt.is_empty() { return dt.cast(time_type).map(|s| (s, by, GroupsProxy::default())); @@ -501,12 +501,12 @@ impl Wrap<&DataFrame> { lower.set_sorted_flag(IsSorted::Ascending); upper.set_sorted_flag(IsSorted::Ascending); } - by.push(lower.into_datetime(tu, tz.clone()).into_series()); - by.push(upper.into_datetime(tu, tz.clone()).into_series()); + by.push(lower.into_datetime(tu, tz.clone()).into_column()); + by.push(upper.into_datetime(tu, tz.clone()).into_column()); } dt.into_datetime(tu, None) - .into_series() + .into_column() .cast(time_type) .map(|s| (s, by, groups)) } @@ -514,13 +514,13 @@ impl Wrap<&DataFrame> { /// Returns: time_keys, keys, groupsproxy fn impl_rolling( &self, - dt: Series, - group_by: Vec, + dt: Column, + group_by: Vec, options: &RollingGroupOptions, tu: TimeUnit, tz: Option, time_type: &DataType, - ) -> PolarsResult<(Series, Vec, GroupsProxy)> { + ) -> PolarsResult<(Column, Vec, GroupsProxy)> { let mut dt = dt.rechunk(); let groups = if group_by.is_empty() { @@ -691,9 +691,9 @@ mod test { None, &StringChunked::from_iter(std::iter::once("raise")), )? 
- .into_series(); + .into_column(); date.set_sorted_flag(IsSorted::Ascending); - let a = Series::new("a".into(), [3, 7, 5, 9, 2, 1]); + let a = Column::new("a".into(), [3, 7, 5, 9, 2, 1]); let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df @@ -709,7 +709,7 @@ mod test { .unwrap(); let sum = unsafe { a.agg_sum(&groups) }; - let expected = Series::new("".into(), [3, 10, 15, 24, 11, 1]); + let expected = Column::new("".into(), [3, 10, 15, 24, 11, 1]); assert_eq!(sum, expected); } @@ -737,10 +737,10 @@ mod test { None, &StringChunked::from_iter(std::iter::once("raise")), )? - .into_series(); + .into_column(); date.set_sorted_flag(IsSorted::Ascending); - let a = Series::new("a".into(), [3, 7, 5, 9, 2, 1]); + let a = Column::new("a".into(), [3, 7, 5, 9, 2, 1]); let df = DataFrame::new(vec![date, a.clone()])?; let (_, _, groups) = df @@ -755,13 +755,13 @@ mod test { ) .unwrap(); - let nulls = Series::new( + let nulls = Column::new( "".into(), [Some(3), Some(7), None, Some(9), Some(2), Some(1)], ); let min = unsafe { a.agg_min(&groups) }; - let expected = Series::new("".into(), [3, 3, 3, 3, 2, 1]); + let expected = Column::new("".into(), [3, 3, 3, 3, 2, 1]); assert_eq!(min, expected); // Expected for nulls is equality. 
@@ -769,29 +769,29 @@ mod test { assert_eq!(min, expected); let max = unsafe { a.agg_max(&groups) }; - let expected = Series::new("".into(), [3, 7, 7, 9, 9, 1]); + let expected = Column::new("".into(), [3, 7, 7, 9, 9, 1]); assert_eq!(max, expected); let max = unsafe { nulls.agg_max(&groups) }; assert_eq!(max, expected); let var = unsafe { a.agg_var(&groups, 1) }; - let expected = Series::new( + let expected = Column::new( "".into(), [0.0, 8.0, 4.000000000000002, 6.666666666666667, 24.5, 0.0], ); - assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); + assert!(abs(&(var - expected)?.as_materialized_series()).unwrap().lt(1e-12).unwrap().all()); let var = unsafe { nulls.agg_var(&groups, 1) }; - let expected = Series::new("".into(), [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); - assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); + let expected = Column::new("".into(), [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); + assert!(abs(&(var - expected)?.as_materialized_series()).unwrap().lt(1e-12).unwrap().all()); let quantile = unsafe { a.agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; - let expected = Series::new("".into(), [3.0, 5.0, 5.0, 6.0, 5.5, 1.0]); + let expected = Column::new("".into(), [3.0, 5.0, 5.0, 6.0, 5.5, 1.0]); assert_eq!(quantile, expected); let quantile = unsafe { nulls.agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; - let expected = Series::new("".into(), [3.0, 5.0, 5.0, 7.0, 5.5, 1.0]); + let expected = Column::new("".into(), [3.0, 5.0, 5.0, 7.0, 5.5, 1.0]); assert_eq!(quantile, expected); Ok(()) @@ -820,9 +820,9 @@ mod test { TimeUnit::Milliseconds, None, )? 
- .into_series(); + .into_column(); - let groups = Series::new("groups".into(), ["a", "a", "a", "b", "b", "a", "a"]); + let groups = Column::new("groups".into(), ["a", "a", "a", "b", "b", "a", "a"]); let df = DataFrame::new(vec![range, groups.clone()]).unwrap(); let (time_key, mut keys, groups) = df @@ -874,7 +874,7 @@ mod test { TimeUnit::Milliseconds, None, )? - .into_series(); + .into_column(); assert_eq!(&upper, &range); let upper = out.column("_lower_boundary").unwrap().slice(0, 3); @@ -899,7 +899,7 @@ mod test { TimeUnit::Milliseconds, None, )? - .into_series(); + .into_column(); assert_eq!(&upper, &range); let expected = GroupsProxy::Idx( @@ -940,9 +940,9 @@ mod test { TimeUnit::Milliseconds, None, )? - .into_series(); + .into_column(); - let groups = Series::new("groups".into(), ["a", "a", "a", "b", "b", "a", "a"]); + let groups = Column::new("groups".into(), ["a", "a", "a", "b", "b", "a", "a"]); let df = DataFrame::new(vec![range, groups.clone()]).unwrap(); let (mut time_key, keys, _groups) = df diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index 37119317ccfa..47fef8180751 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -163,7 +163,7 @@ fn upsample_impl( Ok(out) } else if by.is_empty() { let index_column = source.column(index_column)?; - upsample_single_impl(source, index_column, every) + upsample_single_impl(source, index_column.as_materialized_series(), every) } else { let gb = if stable { source.group_by_stable(by) @@ -173,7 +173,7 @@ fn upsample_impl( // don't parallelize this, this may SO on large data. 
gb?.apply(|df| { let index_column = df.column(index_column)?; - upsample_single_impl(&df, index_column, every) + upsample_single_impl(&df, index_column.as_materialized_series(), every) }) } } From 135c92f5fbc89aae1cd17ddd370501f4e827edbb Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 13:49:22 +0200 Subject: [PATCH 03/42] finish polars-expr --- crates/polars-expr/src/expressions/apply.rs | 93 +++++++++++++++----- crates/polars-expr/src/expressions/column.rs | 23 +++-- crates/polars-expr/src/expressions/sortby.rs | 28 ++++-- crates/polars-expr/src/expressions/window.rs | 11 ++- crates/polars-expr/src/planner.rs | 6 +- crates/polars-expr/src/state/node_timer.rs | 4 +- 6 files changed, 122 insertions(+), 43 deletions(-) diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index 6b71b5df4121..0eeb8555071b 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -129,13 +129,13 @@ impl ApplyExpr { } } - /// Evaluates and flattens `Option` to `Series`. - fn eval_and_flatten(&self, inputs: &mut [Series]) -> PolarsResult { + /// Evaluates and flattens `Option` to `Column`. + fn eval_and_flatten(&self, inputs: &mut [Column]) -> PolarsResult { if let Some(out) = self.function.call_udf(inputs)? { Ok(out) } else { let field = self.to_field(self.input_schema.as_ref().unwrap()).unwrap(); - Ok(Series::full_null(field.name().clone(), 1, field.dtype())) + Ok(Column::full_null(field.name().clone(), 1, field.dtype())) } } fn apply_single_group_aware<'a>( @@ -157,10 +157,10 @@ impl ApplyExpr { // Create input for the function to determine the output dtype, see #3946. 
let agg = agg.list().unwrap(); let input_dtype = agg.inner_dtype(); - let input = Series::full_null(PlSmallStr::EMPTY, 0, input_dtype); + let input = Column::full_null(PlSmallStr::EMPTY, 0, input_dtype); let output = self.eval_and_flatten(&mut [input])?; - let ca = ListChunked::full(name, &output, 0); + let ca = ListChunked::full(name, output.as_materialized_series(), 0); return self.finish_apply_groups(ac, ca); } @@ -170,7 +170,10 @@ impl ApplyExpr { if self.pass_name_to_apply { s.rename(name.clone()); } - self.function.call_udf(&mut [s]) + Ok(self + .function + .call_udf(&mut [Column::from(s)])? + .map(|c| c.as_materialized_series().clone())) }, }; @@ -215,16 +218,27 @@ impl ApplyExpr { let (s, aggregated) = match ac.agg_state() { AggState::AggregatedList(s) => { let ca = s.list().unwrap(); - let out = ca.apply_to_inner(&|s| self.eval_and_flatten(&mut [s]))?; + let out = ca.apply_to_inner(&|s| { + self.eval_and_flatten(&mut [s.into()]) + .map(|c| c.as_materialized_series().clone()) + })?; (out.into_series(), true) }, AggState::NotAggregated(s) => { - let (out, aggregated) = (self.eval_and_flatten(&mut [s.clone()])?, false); + let (out, aggregated) = ( + self.eval_and_flatten(&mut [s.clone().into()])? 
+ .as_materialized_series() + .clone(), + false, + ); check_map_output_len(s.len(), out.len(), &self.expr)?; (out, aggregated) }, agg_state => { - ac.with_agg_state(agg_state.try_map(|s| self.eval_and_flatten(&mut [s.clone()]))?); + ac.with_agg_state(agg_state.try_map(|s| { + self.eval_and_flatten(&mut [s.clone().into()]) + .map(|c| c.as_materialized_series().clone()) + })?); return Ok(ac); }, }; @@ -282,10 +296,12 @@ impl ApplyExpr { for iter in &mut iters { match iter.next().unwrap() { None => return Ok(None), - Some(s) => container.push(s.deep_clone()), + Some(s) => container.push(s.deep_clone().into()), } } - self.function.call_udf(&mut container) + self.function + .call_udf(&mut container) + .map(|r| r.map(|c| c.as_materialized_series().clone())) }) .collect::>()? .with_name(field.name.clone()); @@ -326,17 +342,27 @@ impl PhysicalExpr for ApplyExpr { self.inputs .par_iter() .map(f) + .map(|v| v.map(Column::from)) .collect::>>() }) } else { - self.inputs.iter().map(f).collect::>>() + self.inputs + .iter() + .map(f) + .map(|v| v.map(Column::from)) + .collect::>>() }?; if self.allow_rename { self.eval_and_flatten(&mut inputs) + .map(|c| c.as_materialized_series().clone()) } else { let in_name = inputs[0].name().clone(); - Ok(self.eval_and_flatten(&mut inputs)?.with_name(in_name)) + Ok(self + .eval_and_flatten(&mut inputs)? + .as_materialized_series() + .clone() + .with_name(in_name)) } } @@ -357,7 +383,10 @@ impl PhysicalExpr for ApplyExpr { match self.collect_groups { ApplyOptions::ApplyList => { - let s = self.eval_and_flatten(&mut [ac.aggregated()])?; + let s = self + .eval_and_flatten(&mut [ac.aggregated().into()])? 
+ .as_materialized_series() + .clone(); ac.with_series(s, true, Some(&self.expr))?; Ok(ac) }, @@ -369,8 +398,14 @@ impl PhysicalExpr for ApplyExpr { match self.collect_groups { ApplyOptions::ApplyList => { - let mut s = acs.iter_mut().map(|ac| ac.aggregated()).collect::>(); - let s = self.eval_and_flatten(&mut s)?; + let mut s = acs + .iter_mut() + .map(|ac| ac.aggregated().into()) + .collect::>(); + let s = self + .eval_and_flatten(&mut s)? + .as_materialized_series() + .clone(); // take the first aggregation context that as that is the input series let mut ac = acs.swap_remove(0); ac.with_update_groups(UpdateGroups::WithGroupsLen); @@ -450,14 +485,18 @@ fn apply_multiple_elementwise<'a>( let other = acs[1..] .iter() - .map(|ac| ac.flat_naive().into_owned()) + .map(|ac| ac.flat_naive().into_owned().into()) .collect::>(); let out = ca.apply_to_inner(&|s| { let mut args = Vec::with_capacity(other.len() + 1); - args.push(s); + args.push(s.into()); args.extend_from_slice(&other); - Ok(function.call_udf(&mut args)?.unwrap()) + Ok(function + .call_udf(&mut args)? + .unwrap() + .as_materialized_series() + .clone()) })?; let mut ac = acs.swap_remove(0); ac.with_series(out.into_series(), true, None)?; @@ -479,10 +518,15 @@ fn apply_multiple_elementwise<'a>( ac.flat_naive().into_owned() }) + .map(Column::from) .collect::>(); let input_len = s[0].len(); - let s = function.call_udf(&mut s)?.unwrap(); + let s = function + .call_udf(&mut s)? 
+ .unwrap() + .as_materialized_series() + .clone(); if check_lengths { check_map_output_len(input_len, s.len(), expr)?; } @@ -661,13 +705,18 @@ impl PartitionedAggregation for ApplyExpr { state: &ExecutionState, ) -> PolarsResult { let a = self.inputs[0].as_partitioned_aggregator().unwrap(); - let s = a.evaluate_partitioned(df, groups, state)?; + let s = a.evaluate_partitioned(df, groups, state)?.into(); if self.allow_rename { self.eval_and_flatten(&mut [s]) + .map(|c| c.as_materialized_series().clone()) } else { let in_name = s.name().clone(); - Ok(self.eval_and_flatten(&mut [s])?.with_name(in_name)) + Ok(self + .eval_and_flatten(&mut [s])? + .as_materialized_series() + .clone() + .with_name(in_name)) } } diff --git a/crates/polars-expr/src/expressions/column.rs b/crates/polars-expr/src/expressions/column.rs index 74a20dcdb0ba..6bac214f140c 100644 --- a/crates/polars-expr/src/expressions/column.rs +++ b/crates/polars-expr/src/expressions/column.rs @@ -33,7 +33,7 @@ impl ColumnExpr { for df in state.ext_contexts.as_ref() { let out = df.column(&self.name); if out.is_ok() { - return out.cloned(); + return out.map(Column::as_materialized_series).cloned(); } } Err(e) @@ -75,7 +75,9 @@ impl ColumnExpr { // in release we fallback to linear search #[allow(unreachable_code)] { - df.column(&self.name).cloned() + df.column(&self.name) + .map(Column::as_materialized_series) + .cloned() } } else { Ok(out.clone()) @@ -98,7 +100,9 @@ impl ColumnExpr { } // in release we fallback to linear search #[allow(unreachable_code)] - df.column(&self.name).cloned() + df.column(&self.name) + .map(Column::as_materialized_series) + .cloned() } fn process_from_state_schema( @@ -110,7 +114,9 @@ impl ColumnExpr { match schema.get_full(&self.name) { None => self.process_by_linear_search(df, state, true), Some((idx, _, _)) => match df.get_columns().get(idx) { - Some(out) => self.process_by_idx(out, state, schema, df, false), + Some(out) => { + self.process_by_idx(out.as_materialized_series(), 
state, schema, df, false) + }, None => self.process_by_linear_search(df, state, true), }, } @@ -125,6 +131,7 @@ impl ColumnExpr { .iter() .find(|s| s.name() == &self.name) .unwrap() + .as_materialized_series() .clone()) } } @@ -142,7 +149,13 @@ impl PhysicalExpr for ColumnExpr { // check if the schema was correct // if not do O(n) search match df.get_columns().get(idx) { - Some(out) => self.process_by_idx(out, state, schema, df, true), + Some(out) => self.process_by_idx( + out.as_materialized_series(), + state, + schema, + df, + true, + ), None => { // partitioned group_by special case if let Some(schema) = state.get_schema() { diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs index 71825c971329..00ace093856e 100644 --- a/crates/polars-expr/src/expressions/sortby.rs +++ b/crates/polars-expr/src/expressions/sortby.rs @@ -152,6 +152,7 @@ fn sort_by_groups_multiple_by( let groups = sort_by_s .iter() .map(|s| unsafe { s.take_slice_unchecked(idx) }) + .map(Column::from) .collect::>(); let options = SortMultipleOptions { @@ -161,13 +162,17 @@ fn sort_by_groups_multiple_by( maintain_order, }; - let sorted_idx = groups[0].arg_sort_multiple(&groups[1..], &options).unwrap(); + let sorted_idx = groups[0] + .as_materialized_series() + .arg_sort_multiple(&groups[1..], &options) + .unwrap(); map_sorted_indices_to_group_idx(&sorted_idx, idx) }, GroupsIndicator::Slice([first, len]) => { let groups = sort_by_s .iter() .map(|s| s.slice(first as i64, len as usize)) + .map(Column::from) .collect::>(); let options = SortMultipleOptions { @@ -176,7 +181,10 @@ fn sort_by_groups_multiple_by( multithreaded, maintain_order, }; - let sorted_idx = groups[0].arg_sort_multiple(&groups[1..], &options).unwrap(); + let sorted_idx = groups[0] + .as_materialized_series() + .arg_sort_multiple(&groups[1..], &options) + .unwrap(); map_sorted_indices_to_group_slice(&sorted_idx, first) }, }; @@ -208,11 +216,13 @@ impl PhysicalExpr for SortByExpr { 
.by .iter() .map(|e| { - e.evaluate(df, state).map(|s| match s.dtype() { - #[cfg(feature = "dtype-categorical")] - DataType::Categorical(_, _) | DataType::Enum(_, _) => s, - _ => s.to_physical_repr().into_owned(), - }) + e.evaluate(df, state) + .map(|s| match s.dtype() { + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(_, _) | DataType::Enum(_, _) => s, + _ => s.to_physical_repr().into_owned(), + }) + .map(Column::from) }) .collect::>>()?; @@ -231,7 +241,9 @@ impl PhysicalExpr for SortByExpr { ); } - s_sort_by[0].arg_sort_multiple(&s_sort_by[1..], &options) + s_sort_by[0] + .as_materialized_series() + .arg_sort_multiple(&s_sort_by[1..], &options) }; POOL.install(|| rayon::join(series_f, sorted_idx_f)) }; diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index 47ea0847507c..5a455cf5932b 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -127,7 +127,7 @@ impl WindowExpr { out_column: Series, flattened: Series, mut ac: AggregationContext, - group_by_columns: &[Series], + group_by_columns: &[Column], gb: GroupBy, state: &ExecutionState, cache_key: &str, @@ -412,7 +412,7 @@ impl PhysicalExpr for WindowExpr { let group_by_columns = self .group_by .iter() - .map(|e| e.evaluate(df, state)) + .map(|e| e.evaluate(df, state).map(Column::from)) .collect::>>()?; // if the keys are sorted @@ -584,7 +584,12 @@ impl PhysicalExpr for WindowExpr { let right = &keys[0]; PolarsResult::Ok( group_by_columns[0] - .hash_join_left(right, JoinValidation::ManyToMany, true) + .as_materialized_series() + .hash_join_left( + right.as_materialized_series(), + JoinValidation::ManyToMany, + true, + ) .unwrap() .1, ) diff --git a/crates/polars-expr/src/planner.rs b/crates/polars-expr/src/planner.rs index 18cbc222569a..3f942cf55d59 100644 --- a/crates/polars-expr/src/planner.rs +++ b/crates/polars-expr/src/planner.rs @@ -566,9 +566,9 @@ fn create_physical_expr_inner( }, 
Explode(expr) => { let input = create_physical_expr_inner(*expr, ctxt, expr_arena, schema, state)?; - let function = - SpecialEq::new(Arc::new(move |s: &mut [Series]| s[0].explode().map(Some)) - as Arc); + let function = SpecialEq::new(Arc::new( + move |c: &mut [polars_core::frame::column::Column]| c[0].explode().map(Some), + ) as Arc); Ok(Arc::new(ApplyExpr::new_minimal( vec![input], function, diff --git a/crates/polars-expr/src/state/node_timer.rs b/crates/polars-expr/src/state/node_timer.rs index 8102aa8fcf83..48aa65e12c17 100644 --- a/crates/polars-expr/src/state/node_timer.rs +++ b/crates/polars-expr/src/state/node_timer.rs @@ -42,7 +42,7 @@ impl NodeTimer { polars_ensure!(!ticks.is_empty(), ComputeError: "no data to time"); let start = ticks[0].0; ticks.push((self.query_start, start)); - let nodes_s = Series::new(PlSmallStr::from_static("node"), nodes); + let nodes_s = Column::new(PlSmallStr::from_static("node"), nodes); let start: NoNull = ticks .iter() .map(|(start, _)| (start.duration_since(self.query_start)).as_micros() as u64) @@ -57,7 +57,7 @@ impl NodeTimer { let mut end = end.into_inner(); end.rename(PlSmallStr::from_static("end")); - let columns = vec![nodes_s, start.into_series(), end.into_series()]; + let columns = vec![nodes_s, start.into_column(), end.into_column()]; let df = unsafe { DataFrame::new_no_checks(columns) }; df.sort(vec!["start"], SortMultipleOptions::default()) } From e116e8cbdfde6594fb5969b8d2c1c20e06117e1b Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 14:07:21 +0200 Subject: [PATCH 04/42] finish polars-mem-engine --- crates/polars-core/src/frame/mod.rs | 2 +- .../polars-core/src/frame/upstream_traits.rs | 10 +++++ .../polars-io/src/parquet/read/read_impl.rs | 4 +- .../src/executors/group_by.rs | 9 +++- .../src/executors/group_by_dynamic.rs | 7 ++- .../src/executors/group_by_partitioned.rs | 45 ++++++++++++------- .../src/executors/group_by_rolling.rs | 16 +++++-- .../polars-mem-engine/src/executors/join.rs | 4 
+- .../src/executors/projection_utils.rs | 3 ++ .../src/executors/scan/python_scan.rs | 2 +- .../src/executors/operators/projection.rs | 5 ++- 11 files changed, 75 insertions(+), 32 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index bf06ebd3e546..545db5625fa7 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1262,7 +1262,7 @@ impl DataFrame { Ok(()) } - pub fn _add_columns(&mut self, columns: Vec, schema: &Schema) -> PolarsResult<()> { + pub fn _add_columns(&mut self, columns: Vec, schema: &Schema) -> PolarsResult<()> { for (i, s) in columns.into_iter().enumerate() { // we need to branch here // because users can add multiple columns with the same name diff --git a/crates/polars-core/src/frame/upstream_traits.rs b/crates/polars-core/src/frame/upstream_traits.rs index 11bfc88ef196..38b346ace652 100644 --- a/crates/polars-core/src/frame/upstream_traits.rs +++ b/crates/polars-core/src/frame/upstream_traits.rs @@ -12,6 +12,16 @@ impl FromIterator for DataFrame { } } +impl FromIterator for DataFrame { + /// # Panics + /// + /// Panics if Column have different lengths. 
+ fn from_iter>(iter: T) -> Self { + let v = iter.into_iter().collect(); + DataFrame::new(v).expect("could not create DataFrame from iterator") + } +} + impl Index for DataFrame { type Output = Column; diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index fc528267e6f2..e7b40e02b33f 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -431,9 +431,9 @@ fn rg_to_dfs_prefiltered( debug_assert_eq!(array.len(), filter_mask.set_bits()); - Ok(array) + Ok(array.into_column()) }) - .collect::>>()?; + .collect::>>()?; let mut rearranged_schema = df.schema(); rearranged_schema.merge(Schema::from_arrow_schema(schema.as_ref())); diff --git a/crates/polars-mem-engine/src/executors/group_by.rs b/crates/polars-mem-engine/src/executors/group_by.rs index 8542f9fbb338..230c6e3a475f 100644 --- a/crates/polars-mem-engine/src/executors/group_by.rs +++ b/crates/polars-mem-engine/src/executors/group_by.rs @@ -56,7 +56,7 @@ impl GroupByExec { #[allow(clippy::too_many_arguments)] pub(super) fn group_by_helper( mut df: DataFrame, - keys: Vec, + keys: Vec, aggs: &[Arc], apply: Option>, state: &ExecutionState, @@ -89,6 +89,11 @@ pub(super) fn group_by_helper( rayon::join(get_columns, get_agg) }); let agg_columns = agg_columns?; + // @scalar-opt + let agg_columns = agg_columns + .into_iter() + .map(Column::from) + .collect::>(); columns.extend_from_slice(&agg_columns); DataFrame::new(columns) @@ -99,7 +104,7 @@ impl GroupByExec { let keys = self .keys .iter() - .map(|e| e.evaluate(&df, state)) + .map(|e| e.evaluate(&df, state).map(Column::from)) .collect::>()?; group_by_helper( df, diff --git a/crates/polars-mem-engine/src/executors/group_by_dynamic.rs b/crates/polars-mem-engine/src/executors/group_by_dynamic.rs index 5fe6dca17015..e38ad7d5022c 100644 --- a/crates/polars-mem-engine/src/executors/group_by_dynamic.rs +++ 
b/crates/polars-mem-engine/src/executors/group_by_dynamic.rs @@ -25,7 +25,7 @@ impl GroupByDynamicExec { let keys = self .keys .iter() - .map(|e| e.evaluate(&df, state)) + .map(|e| e.evaluate(&df, state).map(Column::from)) .collect::>>()?; let (mut time_key, mut keys, groups) = df.group_by_dynamic(keys, &self.options)?; @@ -59,6 +59,11 @@ impl GroupByDynamicExec { } let agg_columns = evaluate_aggs(&df, &self.aggs, groups, state)?; + // @scalar-opt + let agg_columns = agg_columns + .into_iter() + .map(Column::from) + .collect::>(); let mut columns = Vec::with_capacity(agg_columns.len() + 1 + keys.len()); columns.extend_from_slice(&keys); diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index ec4a691eb547..d731b1a22db8 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -48,7 +48,7 @@ impl PartitionGroupByExec { } } - fn keys(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult> { + fn keys(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult> { compute_keys(&self.phys_keys, df, state) } } @@ -57,8 +57,10 @@ fn compute_keys( keys: &[Arc], df: &DataFrame, state: &ExecutionState, -) -> PolarsResult> { - keys.iter().map(|s| s.evaluate(df, state)).collect() +) -> PolarsResult> { + keys.iter() + .map(|s| s.evaluate(df, state).map(Column::from)) + .collect() } fn run_partitions( @@ -67,7 +69,7 @@ fn run_partitions( state: &ExecutionState, n_threads: usize, maintain_order: bool, -) -> PolarsResult<(Vec, Vec>)> { +) -> PolarsResult<(Vec, Vec>)> { // We do a partitioned group_by. // Meaning that we first do the group_by operation arbitrarily // split on several threads. Than the final result we apply the same group_by again. 
@@ -102,7 +104,8 @@ fn run_partitions( } } else { agg - }) + } + .into_column()) }) .collect::>>()?; @@ -115,7 +118,7 @@ fn run_partitions( }) } -fn estimate_unique_count(keys: &[Series], mut sample_size: usize) -> PolarsResult { +fn estimate_unique_count(keys: &[Column], mut sample_size: usize) -> PolarsResult { // https://stats.stackexchange.com/a/19090/147321 // estimated unique size // u + ui / m (s - m) @@ -144,13 +147,14 @@ fn estimate_unique_count(keys: &[Series], mut sample_size: usize) -> PolarsResul // not that sampling without replacement is very very expensive. don't do that. let s = keys[0].sample_n(sample_size, true, false, None).unwrap(); // fast multi-threaded way to get unique. - let groups = s.group_tuples(true, false)?; + let groups = s.as_materialized_series().group_tuples(true, false)?; Ok(finish(&groups)) } else { let offset = (keys[0].len() / 2) as i64; let keys = keys .iter() .map(|s| s.slice(offset, sample_size)) + .map(Column::from) .collect::>(); let df = unsafe { DataFrame::new_no_checks(keys) }; let names = df.get_column_names().into_iter().cloned(); @@ -168,7 +172,7 @@ const PARTITION_LIMIT: usize = 1000; // Checks if we should run normal or default aggregation // by sampling data. fn can_run_partitioned( - keys: &[Series], + keys: &[Column], original_df: &DataFrame, state: &ExecutionState, from_partitioned_ds: bool, @@ -289,16 +293,17 @@ impl PartitionGroupByExec { // MERGE phase + // @scalar-correctness? let df = accumulate_dataframes_vertical(splitted_dfs)?; - let keys = splitted_keys - .into_iter() - .reduce(|mut acc, e| { - acc.iter_mut().zip(e).for_each(|(acc, e)| { - let _ = acc.append(&e); + let keys: Vec = + splitted_keys + .into_iter() + .fold(Vec::::new(), |mut acc, e| { + acc.iter_mut().zip(e).for_each(|(acc, e)| { + acc.append(&e.into_column()); + }); + acc }); - acc - }) - .unwrap(); // the partitioned group_by has added columns so we must update the schema. 
state.set_schema(self.output_schema.clone()); @@ -327,7 +332,13 @@ impl PartitionGroupByExec { .zip(&df.get_columns()[self.phys_keys.len()..]) .map(|(expr, partitioned_s)| { let agg_expr = expr.as_partitioned_aggregator().unwrap(); - agg_expr.finalize(partitioned_s.clone(), groups, state) + agg_expr + .finalize( + partitioned_s.as_materialized_series().clone(), + groups, + state, + ) + .map(Column::from) }) .collect(); diff --git a/crates/polars-mem-engine/src/executors/group_by_rolling.rs b/crates/polars-mem-engine/src/executors/group_by_rolling.rs index 810365b25bc6..50ad9da7fef2 100644 --- a/crates/polars-mem-engine/src/executors/group_by_rolling.rs +++ b/crates/polars-mem-engine/src/executors/group_by_rolling.rs @@ -13,7 +13,7 @@ pub(crate) struct GroupByRollingExec { } #[cfg(feature = "dynamic_group_by")] -unsafe fn update_keys(keys: &mut [Series], groups: &GroupsProxy) { +unsafe fn update_keys(keys: &mut [Column], groups: &GroupsProxy) { match groups { GroupsProxy::Idx(groups) => { let first = groups.first(); @@ -21,7 +21,10 @@ unsafe fn update_keys(keys: &mut [Series], groups: &GroupsProxy) { // can be empty, but we still want to know the first value // of that group for key in keys.iter_mut() { - *key = key.take_unchecked_from_slice(first); + *key = key + .as_materialized_series() + .take_unchecked_from_slice(first) + .into_column(); } }, GroupsProxy::Slice { groups, .. 
} => { @@ -30,7 +33,10 @@ unsafe fn update_keys(keys: &mut [Series], groups: &GroupsProxy) { .iter() .map(|[first, _len]| *first) .collect_ca(PlSmallStr::EMPTY); - *key = key.take_unchecked(&indices); + *key = key + .as_materialized_series() + .take_unchecked(&indices) + .into_column(); } }, } @@ -48,7 +54,7 @@ impl GroupByRollingExec { let keys = self .keys .iter() - .map(|e| e.evaluate(&df, state)) + .map(|e| e.evaluate(&df, state).map(Column::from)) .collect::>>()?; let (mut time_key, mut keys, groups) = df.rolling(keys, &self.options)?; @@ -81,6 +87,8 @@ impl GroupByRollingExec { }; let agg_columns = evaluate_aggs(&df, &self.aggs, groups, state)?; + // @scalar-opt + let agg_columns: Vec = agg_columns.into_iter().map(Column::from).collect(); let mut columns = Vec::with_capacity(agg_columns.len() + 1 + keys.len()); columns.extend_from_slice(&keys); diff --git a/crates/polars-mem-engine/src/executors/join.rs b/crates/polars-mem-engine/src/executors/join.rs index 5edab8551ece..0c6ea2bd5f0a 100644 --- a/crates/polars-mem-engine/src/executors/join.rs +++ b/crates/polars-mem-engine/src/executors/join.rs @@ -88,13 +88,13 @@ impl Executor for JoinExec { let left_on_series = self .left_on .iter() - .map(|e| e.evaluate(&df_left, state)) + .map(|e| e.evaluate(&df_left, state).map(Column::from)) .collect::>>()?; let right_on_series = self .right_on .iter() - .map(|e| e.evaluate(&df_right, state)) + .map(|e| e.evaluate(&df_right, state).map(Column::from)) .collect::>>()?; // prepare the tolerance diff --git a/crates/polars-mem-engine/src/executors/projection_utils.rs b/crates/polars-mem-engine/src/executors/projection_utils.rs index bd0e189f0b14..8287c923969a 100644 --- a/crates/polars-mem-engine/src/executors/projection_utils.rs +++ b/crates/polars-mem-engine/src/executors/projection_utils.rs @@ -334,6 +334,9 @@ pub(super) fn check_expand_literals( .collect::>()? 
} + // @scalar-opt + let selected_columns = selected_columns.into_iter().map(Column::from).collect(); + let df = unsafe { DataFrame::new_no_checks(selected_columns) }; // a literal could be projected to a zero length dataframe. diff --git a/crates/polars-mem-engine/src/executors/scan/python_scan.rs b/crates/polars-mem-engine/src/executors/scan/python_scan.rs index 270c52ea963c..067895ed593f 100644 --- a/crates/polars-mem-engine/src/executors/scan/python_scan.rs +++ b/crates/polars-mem-engine/src/executors/scan/python_scan.rs @@ -24,7 +24,7 @@ fn python_df_to_rust(py: Python, df: Bound) -> PolarsResult { let (ptr, len, cap) = raw_parts; unsafe { Ok(DataFrame::new_no_checks(Vec::from_raw_parts( - ptr as *mut Series, + ptr as *mut Column, len, cap, ))) diff --git a/crates/polars-pipe/src/executors/operators/projection.rs b/crates/polars-pipe/src/executors/operators/projection.rs index 67141d0c44a7..f948fbe2e4ef 100644 --- a/crates/polars-pipe/src/executors/operators/projection.rs +++ b/crates/polars-pipe/src/executors/operators/projection.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; use polars_core::schema::SchemaRef; +use polars_core::frame::column::{Column, IntoColumn}; use polars_plan::prelude::ProjectionOptions; use polars_utils::pl_str::PlSmallStr; @@ -70,7 +71,7 @@ impl Operator for ProjectionOperator { has_literals |= s.len() == 1; has_empty |= s.len() == 0; - Ok(s) + Ok(s.into_column()) }) .collect::>>()?; @@ -117,7 +118,7 @@ impl Operator for HstackOperator { let projected = self .exprs .iter() - .map(|e| e.evaluate(chunk, &context.execution_state)) + .map(|e| e.evaluate(chunk, &context.execution_state).map(Column::from)) .collect::>>()?; let columns = chunk.data.get_columns()[..width].to_vec(); From 8f0595f5de441810c8be816085702290ed04945a Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 14:19:18 +0200 Subject: [PATCH 05/42] finish polars-pipe --- 
crates/polars-core/src/frame/column.rs | 12 +++++++++++- crates/polars-core/src/frame/mod.rs | 13 +++++++++++++ crates/polars-mem-engine/src/executors/sort.rs | 2 +- .../polars-mem-engine/src/executors/stack.rs | 18 +++++++++++++++++- .../executors/sinks/group_by/generic/global.rs | 11 +++++++++-- .../sinks/group_by/generic/hash_table.rs | 10 +++++++--- .../executors/sinks/group_by/generic/mod.rs | 11 ++++++----- .../src/executors/sinks/group_by/mod.rs | 6 +++--- .../executors/sinks/group_by/primitive/mod.rs | 8 ++++++-- .../src/executors/sinks/group_by/string.rs | 4 ++-- .../sinks/joins/generic_probe_outer.rs | 2 +- .../src/executors/sinks/sort/ooc.rs | 3 ++- .../src/executors/sinks/sort/sink_multiple.rs | 5 +++-- .../polars-pipe/src/executors/sources/csv.rs | 2 +- crates/polars-pipe/src/operators/chunks.rs | 2 +- 15 files changed, 83 insertions(+), 26 deletions(-) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 865222bbbda9..8e77c92babdb 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -8,7 +8,7 @@ use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::metadata::MetadataFlags; use crate::prelude::*; -use crate::series::{BitRepr, IsSorted}; +use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] @@ -690,6 +690,16 @@ impl Column { // @scalar-opt self.as_materialized_series().product() } + + pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { + // @scalar-opt + self.as_materialized_series().binary_offset() + } + + pub fn phys_iter(&self) -> SeriesPhysIter<'_> { + // @scalar-opt + self.as_materialized_series().phys_iter() + } } impl Default for Column { diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 545db5625fa7..3de199121323 100644 --- a/crates/polars-core/src/frame/mod.rs +++ 
b/crates/polars-core/src/frame/mod.rs @@ -1262,6 +1262,19 @@ impl DataFrame { Ok(()) } + pub fn _add_series(&mut self, series: Vec, schema: &Schema) -> PolarsResult<()> { + for (i, s) in series.into_iter().enumerate() { + // we need to branch here + // because users can add multiple columns with the same name + if i == 0 || schema.get(s.name().as_str()).is_some() { + self.with_column_and_schema(s.into_column(), schema)?; + } else { + self.with_column(s.clone().into_column())?; + } + } + Ok(()) + } + pub fn _add_columns(&mut self, columns: Vec, schema: &Schema) -> PolarsResult<()> { for (i, s) in columns.into_iter().enumerate() { // we need to branch here diff --git a/crates/polars-mem-engine/src/executors/sort.rs b/crates/polars-mem-engine/src/executors/sort.rs index 23374abea7ac..a50e38af2750 100644 --- a/crates/polars-mem-engine/src/executors/sort.rs +++ b/crates/polars-mem-engine/src/executors/sort.rs @@ -25,7 +25,7 @@ impl SortExec { .iter() .enumerate() .map(|(i, e)| { - let mut s = e.evaluate(&df, state)?; + let mut s = e.evaluate(&df, state)?.into_column(); // Polars core will try to set the sorted columns as sorted. // This should only be done with simple col("foo") expressions, // therefore we rename more complex expressions so that diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index 440fbdd619ca..fa8240d0a4cc 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -37,7 +37,7 @@ impl StackExec { self.options.run_parallel, )?; // We don't have to do a broadcast check as cse is not allowed to hit this. 
- df._add_columns(res, schema)?; + df._add_series(res, schema)?; Ok(df) }); @@ -85,6 +85,22 @@ impl StackExec { c.name(), len, height ); } +======= + if !self.options.should_broadcast { + debug_assert!( + res.iter() + .all(|column| column.name().starts_with("__POLARS_CSER_0x")), + "non-broadcasting hstack should only be used for CSE columns" + ); + // Safety: this case only appears as a result of + // CSE optimization, and the usage there produces + // new, unique column names. It is immediately + // followed by a projection which pulls out the + // possibly mismatching column lengths. + unsafe { df.get_columns_mut().extend(res.into_iter().map(Column::from)) }; + } else { + df._add_series(res, schema)?; +>>>>>>> e774e00d2f (finish polars-pipe) } df._add_columns(res, schema)?; } diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs index 4488a6faad82..afa67eb80300 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/global.rs @@ -131,7 +131,7 @@ impl GlobalTable { hashes: &[u64], chunk_indexes: &[IdxSize], keys: &BinaryArray, - agg_cols: &[Series], + agg_cols: &[Column], ) { debug_assert_eq!(hashes.len(), chunk_indexes.len()); debug_assert_eq!(hashes.len(), keys.len()); @@ -168,7 +168,14 @@ impl GlobalTable { let keys = payload.keys(); let chunk_indexes = payload.chunk_index(); let agg_cols = payload.cols(); - self.process_partition_impl(&mut hash_map, hashes, chunk_indexes, keys, agg_cols); + + // @scalar-opt + let agg_cols = agg_cols + .iter() + .map(|v| v.clone().into_column()) + .collect::>(); + + self.process_partition_impl(&mut hash_map, hashes, chunk_indexes, keys, &agg_cols); } } } diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 3e57db331b3e..05947baae209 100644 --- 
a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -268,12 +268,16 @@ impl AggHashTable { unsafe { polars_row::decode::decode_rows(&mut key_rows, &fields, &key_dtypes) }; let mut cols = Vec::with_capacity(self.num_keys + self.agg_constructors.len()); + cols.extend(key_columns.into_iter().map(|arr| { + Series::try_from((PlSmallStr::EMPTY, arr)) + .unwrap() + .into_column() + })); cols.extend( - key_columns + agg_builders .into_iter() - .map(|arr| Series::try_from((PlSmallStr::EMPTY, arr)).unwrap()), + .map(|buf| buf.into_series().into_column()), ); - cols.extend(agg_builders.into_iter().map(|buf| buf.into_series())); physical_agg_to_logical(&mut cols, &self.output_schema); unsafe { DataFrame::new_no_checks(cols) } } diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs index 55244679e204..e9fa7ba495cd 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/mod.rs @@ -75,23 +75,24 @@ impl SpillPayload { debug_assert_eq!(self.hashes.len(), self.keys.len()); let hashes = - UInt64Chunked::from_vec(PlSmallStr::from_static(HASH_COL), self.hashes).into_series(); + UInt64Chunked::from_vec(PlSmallStr::from_static(HASH_COL), self.hashes).into_column(); let chunk_idx = - IdxCa::from_vec(PlSmallStr::from_static(INDEX_COL), self.chunk_idx).into_series(); + IdxCa::from_vec(PlSmallStr::from_static(INDEX_COL), self.chunk_idx).into_column(); let keys = BinaryOffsetChunked::with_chunk(PlSmallStr::from_static(KEYS_COL), self.keys) - .into_series(); + .into_column(); let mut cols = Vec::with_capacity(self.aggs.len() + 3); cols.push(hashes); cols.push(chunk_idx); cols.push(keys); - cols.extend(self.aggs); + // @scalar-opt + cols.extend(self.aggs.into_iter().map(Column::from)); unsafe { DataFrame::new_no_checks(cols) } } fn 
spilled_to_columns( spilled: &DataFrame, - ) -> (&[u64], &[IdxSize], &BinaryArray, &[Series]) { + ) -> (&[u64], &[IdxSize], &BinaryArray, &[Column]) { let cols = spilled.get_columns(); let hashes = cols[0].u64().unwrap(); let hashes = hashes.cont_slice().unwrap(); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/mod.rs index 7a999e7e7cc7..b8478dd9eb7e 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/mod.rs @@ -13,7 +13,7 @@ use polars_core::using_string_cache; pub(crate) use primitive::*; pub(crate) use string::*; -pub(super) fn physical_agg_to_logical(cols: &mut [Series], output_schema: &Schema) { +pub(super) fn physical_agg_to_logical(cols: &mut [Column], output_schema: &Schema) { for (s, (name, dtype)) in cols.iter_mut().zip(output_schema.iter()) { if s.name() != name { s.rename(name.clone()); @@ -32,7 +32,7 @@ pub(super) fn physical_agg_to_logical(cols: &mut [Series], output_schema: &Schem matches!(dt, DataType::Enum(_, _)), *ordering, ) - .into_series() + .into_column() } } else { let cats = s.u32().unwrap().clone(); @@ -40,7 +40,7 @@ pub(super) fn physical_agg_to_logical(cols: &mut [Series], output_schema: &Schem // SAFETY, we go from logical to primitive back to logical so the categoricals should still match the global map. 
*s = unsafe { CategoricalChunked::from_global_indices_unchecked(cats, *ordering) - .into_series() + .into_column() }; } else { // we set the global string cache once we start a streaming pipeline diff --git a/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs index d20ab9bf2b0d..8442b8a9cd7e 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/primitive/mod.rs @@ -205,8 +205,12 @@ where ); let mut cols = Vec::with_capacity(1 + self.number_of_aggs()); - cols.push(key_builder.finish().into_series()); - cols.extend(buffers.into_iter().map(|buf| buf.into_series())); + cols.push(key_builder.finish().into_series().into_column()); + cols.extend( + buffers + .into_iter() + .map(|buf| buf.into_series().into_column()), + ); physical_agg_to_logical(&mut cols, &self.output_schema); Some(unsafe { DataFrame::new_no_checks(cols) }) }) diff --git a/crates/polars-pipe/src/executors/sinks/group_by/string.rs b/crates/polars-pipe/src/executors/sinks/group_by/string.rs index 0a66255f6e7c..f16ca2e17bc3 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/string.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/string.rs @@ -209,8 +209,8 @@ impl StringGroupbySink { ); let mut cols = Vec::with_capacity(1 + self.number_of_aggs()); - cols.push(key_builder.finish().into_series()); - cols.extend(buffers.into_iter().map(|buf| buf.into_series())); + cols.push(key_builder.finish().into_series().into_column()); + cols.extend(buffers.into_iter().map(|buf| buf.into_series().into_column())); physical_agg_to_logical(&mut cols, &self.output_schema); Some(unsafe { DataFrame::new_no_checks(cols) }) }) diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs index 0157fe660de5..2ab417ad2096 100644 --- 
a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs @@ -268,7 +268,7 @@ impl GenericFullOuterJoinProbe { right_df .get_columns() .iter() - .map(|s| Series::full_null(s.name().clone(), size, s.dtype())) + .map(|s| Column::full_null(s.name().clone(), size, s.dtype())) .collect(), ) }; diff --git a/crates/polars-pipe/src/executors/sinks/sort/ooc.rs b/crates/polars-pipe/src/executors/sinks/sort/ooc.rs index 64acfa30a5db..1c04f67a34a9 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/ooc.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/ooc.rs @@ -168,7 +168,8 @@ pub(super) fn sort_ooc( let df = read_df(&path)?; let sort_col = &df.get_columns()[idx]; - let assigned_parts = det_partitions(sort_col, &samples, descending); + let assigned_parts = + det_partitions(sort_col.as_materialized_series(), &samples, descending); // partition the dataframe into proper buckets let (iter, unique_assigned_parts) = diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index 053ccb1f1999..f08609ab9e21 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -100,7 +100,8 @@ fn finalize_dataframe( let (name, logical_dtype) = schema.get_at_index(sort_idx).unwrap(); assert_eq!(logical_dtype.to_physical(), DataType::from(arr.dtype())); let col = - Series::from_chunks_and_dtype_unchecked(name.clone(), vec![arr], logical_dtype); + Series::from_chunks_and_dtype_unchecked(name.clone(), vec![arr], logical_dtype) + .into_column(); cols.insert(sort_idx, col); } } @@ -205,7 +206,7 @@ impl SortSinkMultiple { for i in self.sort_idx.iter() { let s = &cols[*i]; - let arr = _get_rows_encoded_compat_array(s)?; + let arr = _get_rows_encoded_compat_array(s.as_materialized_series())?; self.sort_column.push(arr); } diff --git 
a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index f3267ac1e90a..323776deb976 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -220,7 +220,7 @@ impl Source for CsvSource { // gets passed contains the column. for s in unsafe { data_chunk.data.get_columns_mut() } { if s.name() == ca.name() { - *s = ca.slice(0, s.len()).into_series(); + *s = ca.slice(0, s.len()).into_column(); break; } } diff --git a/crates/polars-pipe/src/operators/chunks.rs b/crates/polars-pipe/src/operators/chunks.rs index 1c78a32dde80..593936c82b81 100644 --- a/crates/polars-pipe/src/operators/chunks.rs +++ b/crates/polars-pipe/src/operators/chunks.rs @@ -14,7 +14,7 @@ impl DataChunk { #[cfg(debug_assertions)] { for c in data.get_columns() { - assert_eq!(c.chunks().len(), 1); + assert_eq!(c.as_materialized_series().chunks().len(), 1); } } Self { chunk_index, data } From 1d15a851390ddd53bd2c141d6a0100895b3b3096 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 14:24:24 +0200 Subject: [PATCH 06/42] finish polars-stream --- crates/polars-core/src/frame/column.rs | 5 +++++ crates/polars-core/src/scalar/mod.rs | 9 ++++++++- .../src/nodes/parquet_source/row_group_decode.rs | 5 +++-- crates/polars-stream/src/nodes/reduce.rs | 2 +- crates/polars-stream/src/nodes/select.rs | 3 ++- crates/polars-stream/src/physical_plan/lower_expr.rs | 8 +++++--- 6 files changed, 24 insertions(+), 8 deletions(-) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 8e77c92babdb..2aa791dfbaf2 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -700,6 +700,11 @@ impl Column { // @scalar-opt self.as_materialized_series().phys_iter() } + + pub unsafe fn get_unchecked(&self, index: usize) -> AnyValue { + // @scalar-opt + self.as_materialized_series().get_unchecked(index) + } } impl Default for 
Column { diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index 3220e3468999..aead770ca37c 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -5,7 +5,7 @@ use polars_utils::pl_str::PlSmallStr; use serde::{Deserialize, Serialize}; use crate::datatypes::{AnyValue, DataType}; -use crate::prelude::Series; +use crate::prelude::{Column, Series, IntoColumn}; #[derive(Clone, Debug, PartialEq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -45,6 +45,13 @@ impl Scalar { Series::from_any_values_and_dtype(name, &[self.as_any_value()], &self.dtype, true).unwrap() } + pub fn into_column(self, name: PlSmallStr) -> Column { + // @scalar-opt + Series::from_any_values_and_dtype(name, &[self.as_any_value()], &self.dtype, true) + .unwrap() + .into_column() + } + #[inline(always)] pub fn dtype(&self) -> &DataType { &self.dtype diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs index e0944203cfe6..028595c0bfa9 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use polars_core::frame::DataFrame; use polars_core::prelude::{ ArrowField, ArrowSchema, BooleanChunked, ChunkFull, IdxCa, StringChunked, + IntColumn, Column, }; use polars_core::series::{IntoSeries, IsSorted, Series}; use polars_core::utils::arrow::bitmap::{Bitmap, MutableBitmap}; @@ -68,7 +69,7 @@ impl RowGroupDecoder { if self.row_index.is_some() { // Add a placeholder so that we don't have to shift the entire vec // later. 
- out_columns.push(Series::default()); + out_columns.push(Column::default()); } let slice_range = row_group_data @@ -197,7 +198,7 @@ impl RowGroupDecoder { ); ca.set_sorted_flag(IsSorted::Ascending); - Ok(Some(ca.into_series())) + Ok(Some(ca.into_column())) } else { Ok(None) } diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index f6de3bd1124a..15048daba4f8 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -120,7 +120,7 @@ impl ComputeNode for ReduceNode { .map(|(r, field)| { r.finalize().map(|scalar| { scalar - .into_series(field.name.clone()) + .into_column(field.name.clone()) .cast(&field.dtype) .unwrap() }) diff --git a/crates/polars-stream/src/nodes/select.rs b/crates/polars-stream/src/nodes/select.rs index 688580e10319..3b060e78e654 100644 --- a/crates/polars-stream/src/nodes/select.rs +++ b/crates/polars-stream/src/nodes/select.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use polars_core::prelude::IntoColumn; use polars_core::schema::Schema; use super::compute_node_prelude::*; @@ -52,7 +53,7 @@ impl ComputeNode for SelectNode { let mut selected = Vec::new(); for selector in slf.selectors.iter() { let s = selector.evaluate(&df, state).await?; - selected.push(s); + selected.push(s.into_column()); } let ret = if slf.extend_original { diff --git a/crates/polars-stream/src/physical_plan/lower_expr.rs b/crates/polars-stream/src/physical_plan/lower_expr.rs index af0e138ec30c..6c4126a9e779 100644 --- a/crates/polars-stream/src/physical_plan/lower_expr.rs +++ b/crates/polars-stream/src/physical_plan/lower_expr.rs @@ -2,7 +2,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use polars_core::frame::DataFrame; -use polars_core::prelude::{Field, InitHashMaps, PlHashMap, PlHashSet}; +use polars_core::prelude::{Column, Field, InitHashMaps, PlHashMap, PlHashSet}; use polars_core::schema::{Schema, SchemaExt}; use polars_error::PolarsResult; use 
polars_expr::planner::get_expr_depth_limit; @@ -238,7 +238,9 @@ fn build_input_independent_node_with_ctx( let phys_expr = create_physical_expr(expr, Context::Default, ctx.expr_arena, None, &mut state)?; - phys_expr.evaluate(&empty, &execution_state) + phys_expr + .evaluate(&empty, &execution_state) + .map(Column::from) }) .try_collect_vec()?; @@ -352,7 +354,7 @@ fn build_fallback_node_with_ctx( let exec_state = ExecutionState::new(); let columns = phys_exprs .iter() - .map(|phys_expr| phys_expr.evaluate(&df, &exec_state)) + .map(|phys_expr| phys_expr.evaluate(&df, &exec_state).map(Column::from)) .try_collect()?; DataFrame::new_with_broadcast(columns) }; From 43fb45e726c20a9fdc40867a0abfd01c1d645b47 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 14:28:58 +0200 Subject: [PATCH 07/42] finish polars-lazy --- crates/polars-lazy/src/dsl/eval.rs | 36 +++++++++++++++--------------- crates/polars-lazy/src/dsl/list.rs | 30 ++++++++++++------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/crates/polars-lazy/src/dsl/eval.rs b/crates/polars-lazy/src/dsl/eval.rs index 574c2b336407..62d4a446b7f7 100644 --- a/crates/polars-lazy/src/dsl/eval.rs +++ b/crates/polars-lazy/src/dsl/eval.rs @@ -45,12 +45,12 @@ pub trait ExprEvalExtension: IntoExpr + Sized { fn cumulative_eval(self, expr: Expr, min_periods: usize, parallel: bool) -> Expr { let this = self.into_expr(); let expr2 = expr.clone(); - let func = move |mut s: Series| { - let name = s.name().clone(); - s.rename(PlSmallStr::EMPTY); + let func = move |mut c: Column| { + let name = c.name().clone(); + c.rename(PlSmallStr::EMPTY); // Ensure we get the new schema. 
- let output_field = eval_field_to_dtype(s.field().as_ref(), &expr, false); + let output_field = eval_field_to_dtype(c.field().as_ref(), &expr, false); let expr = expr.clone(); let mut arena = Arena::with_capacity(10); @@ -65,7 +65,7 @@ pub trait ExprEvalExtension: IntoExpr + Sized { let state = ExecutionState::new(); - let finish = |out: Series| { + let finish = |out: Column| { polars_ensure!( out.len() <= 1, ComputeError: @@ -76,13 +76,13 @@ pub trait ExprEvalExtension: IntoExpr + Sized { }; let avs = if parallel { - (1..s.len() + 1) + (1..c.len() + 1) .into_par_iter() .map(|len| { - let s = s.slice(0, len); + let s = c.slice(0, len); if (len - s.null_count()) >= min_periods { - let df = s.into_frame(); - let out = phys_expr.evaluate(&df, &state)?; + let df = c.into_frame(); + let out = phys_expr.evaluate(&df, &state)?.into_column(); finish(out) } else { Ok(AnyValue::Null) @@ -91,13 +91,13 @@ pub trait ExprEvalExtension: IntoExpr + Sized { .collect::>>()? } else { let mut df_container = DataFrame::empty(); - (1..s.len() + 1) + (1..c.len() + 1) .map(|len| { - let s = s.slice(0, len); - if (len - s.null_count()) >= min_periods { + let c = c.slice(0, len); + if (len - c.null_count()) >= min_periods { unsafe { - df_container.get_columns_mut().push(s); - let out = phys_expr.evaluate(&df_container, &state)?; + df_container.get_columns_mut().push(c.into_column()); + let out = phys_expr.evaluate(&df_container, &state)?.into_column(); df_container.get_columns_mut().clear(); finish(out) } @@ -107,12 +107,12 @@ pub trait ExprEvalExtension: IntoExpr + Sized { }) .collect::>>()? 
}; - let s = Series::new(name, avs); + let c = Column::new(name, avs); - if s.dtype() != output_field.dtype() { - s.cast(output_field.dtype()).map(Some) + if c.dtype() != output_field.dtype() { + c.cast(output_field.dtype()).map(Some) } else { - Ok(Some(s)) + Ok(Some(c)) } }; diff --git a/crates/polars-lazy/src/dsl/list.rs b/crates/polars-lazy/src/dsl/list.rs index fb1594196e41..4dae2529bc14 100644 --- a/crates/polars-lazy/src/dsl/list.rs +++ b/crates/polars-lazy/src/dsl/list.rs @@ -44,12 +44,12 @@ fn offsets_to_groups(offsets: &[i64]) -> Option { } fn run_per_sublist( - s: Series, + s: Column, lst: &ListChunked, expr: &Expr, parallel: bool, output_field: Field, -) -> PolarsResult> { +) -> PolarsResult> { let phys_expr = prepare_expression_for_context( PlSmallStr::EMPTY, expr, @@ -86,7 +86,7 @@ fn run_per_sublist( lst.into_iter() .map(|s| { s.and_then(|s| unsafe { - df_container.get_columns_mut().push(s); + df_container.get_columns_mut().push(s.into_column()); let out = phys_expr.evaluate(&df_container, &state); df_container.get_columns_mut().clear(); match out { @@ -107,9 +107,9 @@ fn run_per_sublist( ca.rename(s.name().clone()); if ca.dtype() != output_field.dtype() { - ca.cast(output_field.dtype()).map(Some) + ca.cast(output_field.dtype()).map(Column::from).map(Some) } else { - Ok(Some(ca.into_series())) + Ok(Some(ca.into_column())) } } @@ -117,7 +117,7 @@ fn run_on_group_by_engine( name: PlSmallStr, lst: &ListChunked, expr: &Expr, -) -> PolarsResult> { +) -> PolarsResult> { let lst = lst.rechunk(); let arr = lst.downcast_iter().next().unwrap(); let groups = offsets_to_groups(arr.offsets()).unwrap(); @@ -142,7 +142,7 @@ fn run_on_group_by_engine( }, _ => ac.aggregated(), }; - Ok(Some(out.with_name(name))) + Ok(Some(out.with_name(name).into_column())) } pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { @@ -151,7 +151,7 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { let this = self.into_list_name_space(); let expr2 = expr.clone(); - 
let func = move |s: Series| { + let func = move |c: Column| { for e in expr.into_iter() { match e { #[cfg(feature = "dtype-categorical")] @@ -173,19 +173,19 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { _ => {}, } } - let lst = s.list()?.clone(); + let lst = c.list()?.clone(); // # fast returns // ensure we get the new schema let output_field = eval_field_to_dtype(lst.ref_field(), &expr, true); if lst.is_empty() { - return Ok(Some(Series::new_empty( - s.name().clone(), + return Ok(Some(Column::new_empty( + c.name().clone(), output_field.dtype(), ))); } if lst.null_count() == lst.len() { - return Ok(Some(s.cast(output_field.dtype())?)); + return Ok(Some(c.cast(output_field.dtype())?.into_column())); } let fits_idx_size = lst.get_values_size() <= (IdxSize::MAX as usize); @@ -195,10 +195,10 @@ pub trait ListNameSpaceExtension: IntoListNameSpace + Sized { expr.into_iter().any(|e| matches!(e, Expr::AnonymousFunction { options, .. } if options.fmt_str == MAP_LIST_NAME)) }; - if fits_idx_size && s.null_count() == 0 && !is_user_apply() { - run_on_group_by_engine(s.name().clone(), &lst, &expr) + if fits_idx_size && c.null_count() == 0 && !is_user_apply() { + run_on_group_by_engine(c.name().clone(), &lst, &expr) } else { - run_per_sublist(s, &lst, &expr, parallel, output_field) + run_per_sublist(c, &lst, &expr, parallel, output_field) } }; From d58f393228643cc24b7f5f40706532fa2683a46a Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 14:30:09 +0200 Subject: [PATCH 08/42] finish polars-sql --- crates/polars-sql/src/context.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 23ffb25070fa..c5d9f4a371b3 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -471,7 +471,8 @@ impl SQLContext { let plan = plan .split('\n') .collect::() - .with_name(PlSmallStr::from_static("Logical Plan")); + 
.with_name(PlSmallStr::from_static("Logical Plan")) + .into_column(); let df = DataFrame::new(vec![plan])?; Ok(df.lazy()) }, @@ -481,7 +482,7 @@ impl SQLContext { // SHOW TABLES fn execute_show_tables(&mut self, _: &Statement) -> PolarsResult { - let tables = Series::new("name".into(), self.get_tables()); + let tables = Column::new("name".into(), self.get_tables()); let df = DataFrame::new(vec![tables])?; Ok(df.lazy()) } @@ -1031,7 +1032,7 @@ impl SQLContext { "UNNEST table alias requires {} column name{}, found {}", column_values.len(), plural, column_names.len() ); } - let column_series: Vec = column_values + let column_series: Vec = column_values .into_iter() .zip(column_names) .map(|(s, name)| { @@ -1041,6 +1042,7 @@ impl SQLContext { s.clone() } }) + .map(Column::from) .collect(); let lf = DataFrame::new(column_series)?.lazy(); From 563d5fcf93c41e86d1f0bd54ad3cfa6535087ec4 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 14:58:31 +0200 Subject: [PATCH 09/42] finished polars-python --- crates/polars-core/src/frame/column.rs | 19 ++++++++++ crates/polars-core/src/frame/mod.rs | 4 +-- .../src/executors/group_by_partitioned.rs | 2 +- .../src/dsl/function_expr/range/utils.rs | 2 +- crates/polars-plan/src/dsl/python_udf.rs | 5 ++- crates/polars-python/src/dataframe/general.rs | 11 ++++-- crates/polars-python/src/expr/datetime.rs | 5 +-- crates/polars-python/src/functions/lazy.rs | 36 ++++++++++++++++--- .../polars-python/src/interop/arrow/to_py.rs | 6 +++- .../src/interop/arrow/to_rust.rs | 8 +++-- .../src/interop/numpy/to_numpy_df.rs | 6 ++-- crates/polars-python/src/map/dataframe.rs | 10 ++++-- crates/polars-python/src/map/lazy.rs | 14 ++++---- crates/polars-python/src/on_startup.rs | 8 ++--- crates/polars-python/src/series/general.rs | 2 +- crates/polars-python/src/series/mod.rs | 13 ++++++- 16 files changed, 114 insertions(+), 37 deletions(-) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 
2aa791dfbaf2..beb8b4058d50 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -7,6 +7,7 @@ use polars_error::PolarsResult; use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::metadata::MetadataFlags; +use crate::chunked_array::object::PolarsObjectSafe; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; @@ -75,6 +76,14 @@ impl Column { } } + #[inline] + pub fn take_materialized_series(self) -> Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => s.take_materialized_series(), + } + } + #[inline] pub fn dtype(&self) -> &DataType { // @scalar-opt @@ -705,6 +714,10 @@ impl Column { // @scalar-opt self.as_materialized_series().get_unchecked(index) } + + pub fn get_object(&self, index: usize) -> Option<&dyn PolarsObjectSafe> { + self.as_materialized_series().get_object(index) + } } impl Default for Column { @@ -926,6 +939,12 @@ impl ScalarColumn { self.materialized.get_or_init(|| self.to_series()) } + pub fn take_materialized_series(self) -> Series { + self.materialized + .into_inner() + .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) + } + pub fn select_chunk(&self, _: usize) -> Series { // @scalar-opt // @scalar-correctness? 
diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 3de199121323..c245484089d4 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -174,7 +174,7 @@ pub struct DataFrame { } impl DataFrame { - pub fn materialized_column_iter(&self) -> impl Iterator { + pub fn materialized_column_iter(&self) -> impl Iterator + ExactSizeIterator { self.columns.iter().map(Column::as_materialized_series) } @@ -630,7 +630,7 @@ impl DataFrame { /// assert_eq!(iterator.next(), None); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> impl Iterator + ExactSizeIterator { self.materialized_column_iter() } diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index d731b1a22db8..3a5626fb303f 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -300,7 +300,7 @@ impl PartitionGroupByExec { .into_iter() .fold(Vec::::new(), |mut acc, e| { acc.iter_mut().zip(e).for_each(|(acc, e)| { - acc.append(&e.into_column()); + _ = acc.append(&e.into_column()); }); acc }); diff --git a/crates/polars-plan/src/dsl/function_expr/range/utils.rs b/crates/polars-plan/src/dsl/function_expr/range/utils.rs index 232a9d1bb37e..c6f91c49b02b 100644 --- a/crates/polars-plan/src/dsl/function_expr/range/utils.rs +++ b/crates/polars-plan/src/dsl/function_expr/range/utils.rs @@ -1,6 +1,6 @@ use polars_core::prelude::{ polars_bail, polars_ensure, ChunkedArray, Column, Int64Chunked, IntoColumn, ListBuilderTrait, - ListPrimitiveChunkedBuilder, PolarsIntegerType, PolarsResult, Series, + ListPrimitiveChunkedBuilder, PolarsIntegerType, PolarsResult, }; pub(super) fn temporal_series_to_i64_scalar(s: &Column) -> Option { diff --git a/crates/polars-plan/src/dsl/python_udf.rs b/crates/polars-plan/src/dsl/python_udf.rs index 
cdd8194793e6..0fb786db493d 100644 --- a/crates/polars-plan/src/dsl/python_udf.rs +++ b/crates/polars-plan/src/dsl/python_udf.rs @@ -5,7 +5,6 @@ use polars_core::datatypes::{DataType, Field}; use polars_core::error::*; use polars_core::frame::DataFrame; use polars_core::frame::column::Column; -use polars_core::prelude::Series; use polars_core::schema::Schema; use pyo3::prelude::*; use pyo3::pybacked::PyBackedBytes; @@ -20,7 +19,7 @@ use crate::constants::MAP_LIST_NAME; use crate::prelude::*; // Will be overwritten on Python Polars start up. -pub static mut CALL_SERIES_UDF_PYTHON: Option< +pub static mut CALL_COLUMNS_UDF_PYTHON: Option< fn(s: Column, lambda: &PyObject) -> PolarsResult, > = None; pub static mut CALL_DF_UDF_PYTHON: Option< @@ -166,7 +165,7 @@ impl DataFrameUdf for PythonFunction { impl ColumnsUdf for PythonUdfExpression { fn call_udf(&self, s: &mut [Column]) -> PolarsResult> { - let func = unsafe { CALL_SERIES_UDF_PYTHON.unwrap() }; + let func = unsafe { CALL_COLUMNS_UDF_PYTHON.unwrap() }; let output_type = self .output_type diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index 043564b20c99..ff635c08898b 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -27,6 +27,8 @@ impl PyDataFrame { #[new] pub fn __init__(columns: Vec) -> PyResult { let columns = columns.to_series(); + // @scalar-opt + let columns = columns.into_iter().map(|s| s.into()).collect(); let df = DataFrame::new(columns).map_err(PyPolarsErr::from)?; Ok(PyDataFrame::new(df)) } @@ -181,12 +183,16 @@ impl PyDataFrame { pub fn hstack(&self, columns: Vec) -> PyResult { let columns = columns.to_series(); + // @scalar-opt + let columns = columns.into_iter().map(Into::into).collect::>(); let df = self.df.hstack(&columns).map_err(PyPolarsErr::from)?; Ok(df.into()) } pub fn hstack_mut(&mut self, columns: Vec) -> PyResult<()> { let columns = columns.to_series(); + // @scalar-opt 
+ let columns = columns.into_iter().map(Into::into).collect::>(); self.df.hstack_mut(&columns).map_err(PyPolarsErr::from)?; Ok(()) } @@ -208,6 +214,7 @@ impl PyDataFrame { pub fn drop_in_place(&mut self, name: &str) -> PyResult { let s = self.df.drop_in_place(name).map_err(PyPolarsErr::from)?; + let s = s.take_materialized_series(); Ok(PySeries { series: s }) } @@ -222,7 +229,7 @@ impl PyDataFrame { let s = index_adjusted.and_then(|i| df.select_at_idx(i)); match s { - Some(s) => Ok(PySeries::new(s.clone())), + Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())), None => Err(PyIndexError::new_err( polars_err!(oob = index, df.width()).to_string(), )), @@ -240,7 +247,7 @@ impl PyDataFrame { let series = self .df .column(name) - .map(|s| PySeries::new(s.clone())) + .map(|s| PySeries::new(s.as_materialized_series().clone())) .map_err(PyPolarsErr::from)?; Ok(series) } diff --git a/crates/polars-python/src/expr/datetime.rs b/crates/polars-python/src/expr/datetime.rs index 69325b03a19f..31052e6189d4 100644 --- a/crates/polars-python/src/expr/datetime.rs +++ b/crates/polars-python/src/expr/datetime.rs @@ -33,8 +33,9 @@ impl PyExpr { .clone() .map( |s| { - s.timestamp(TimeUnit::Milliseconds) - .map(|ca| Some((ca / 1000).into_series())) + s.take_materialized_series() + .timestamp(TimeUnit::Milliseconds) + .map(|ca| Some((ca / 1000).into_column())) }, GetOutput::from_type(DataType::Int64), ) diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index 2d39bcdbdc09..d649b7be4cba 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -225,7 +225,14 @@ pub fn arctan2(y: PyExpr, x: PyExpr) -> PyExpr { pub fn cum_fold(acc: PyExpr, lambda: PyObject, exprs: Vec, include_init: bool) -> PyExpr { let exprs = exprs.to_exprs(); - let func = move |a: Series, b: Series| binary_lambda(&lambda, a, b); + let func = move |a: Column, b: Column| { + binary_lambda( + &lambda, + 
a.take_materialized_series(), + b.take_materialized_series(), + ) + .map(|v| v.map(Column::from)) + }; dsl::cum_fold_exprs(acc.inner, func, exprs, include_init).into() } @@ -233,7 +240,14 @@ pub fn cum_fold(acc: PyExpr, lambda: PyObject, exprs: Vec, include_init: pub fn cum_reduce(lambda: PyObject, exprs: Vec) -> PyExpr { let exprs = exprs.to_exprs(); - let func = move |a: Series, b: Series| binary_lambda(&lambda, a, b); + let func = move |a: Column, b: Column| { + binary_lambda( + &lambda, + a.take_materialized_series(), + b.take_materialized_series(), + ) + .map(|v| v.map(Column::from)) + }; dsl::cum_reduce_exprs(func, exprs).into() } @@ -394,7 +408,14 @@ pub fn first() -> PyExpr { pub fn fold(acc: PyExpr, lambda: PyObject, exprs: Vec) -> PyExpr { let exprs = exprs.to_exprs(); - let func = move |a: Series, b: Series| binary_lambda(&lambda, a, b); + let func = move |a: Column, b: Column| { + binary_lambda( + &lambda, + a.take_materialized_series(), + b.take_materialized_series(), + ) + .map(|v| v.map(Column::from)) + }; dsl::fold_exprs(acc.inner, func, exprs).into() } @@ -495,7 +516,14 @@ pub fn pearson_corr(a: PyExpr, b: PyExpr, ddof: u8) -> PyExpr { pub fn reduce(lambda: PyObject, exprs: Vec) -> PyExpr { let exprs = exprs.to_exprs(); - let func = move |a: Series, b: Series| binary_lambda(&lambda, a, b); + let func = move |a: Column, b: Column| { + binary_lambda( + &lambda, + a.take_materialized_series(), + b.take_materialized_series(), + ) + .map(|v| v.map(Column::from)) + }; dsl::reduce_exprs(func, exprs).into() } diff --git a/crates/polars-python/src/interop/arrow/to_py.rs b/crates/polars-python/src/interop/arrow/to_py.rs index de6c07ef31c9..017771bb1567 100644 --- a/crates/polars-python/src/interop/arrow/to_py.rs +++ b/crates/polars-python/src/interop/arrow/to_py.rs @@ -95,7 +95,11 @@ impl DataFrameStreamIterator { let dtype = ArrowDataType::Struct(schema.into_iter_values().collect()); Self { - columns: df.get_columns().to_vec(), + columns: df + 
.get_columns() + .iter() + .map(|v| v.as_materialized_series().clone()) + .collect(), dtype, idx: 0, n_chunks: df.n_chunks(), diff --git a/crates/polars-python/src/interop/arrow/to_rust.rs b/crates/polars-python/src/interop/arrow/to_rust.rs index 8d76f53b243a..809bd527a492 100644 --- a/crates/polars-python/src/interop/arrow/to_rust.rs +++ b/crates/polars-python/src/interop/arrow/to_rust.rs @@ -85,7 +85,8 @@ pub fn to_rust_df(rb: &[Bound]) -> PyResult { .enumerate() .map(|(i, arr)| { let s = Series::try_from((names[i].clone(), arr)) - .map_err(PyPolarsErr::from)?; + .map_err(PyPolarsErr::from)? + .into_column(); Ok(s) }) .collect::>>() @@ -95,8 +96,9 @@ pub fn to_rust_df(rb: &[Bound]) -> PyResult { .into_iter() .enumerate() .map(|(i, arr)| { - let s = - Series::try_from((names[i].clone(), arr)).map_err(PyPolarsErr::from)?; + let s = Series::try_from((names[i].clone(), arr)) + .map_err(PyPolarsErr::from)? + .into_column(); Ok(s) }) .collect::>>() diff --git a/crates/polars-python/src/interop/numpy/to_numpy_df.rs b/crates/polars-python/src/interop/numpy/to_numpy_df.rs index 2718203d46f3..b249970c438d 100644 --- a/crates/polars-python/src/interop/numpy/to_numpy_df.rs +++ b/crates/polars-python/src/interop/numpy/to_numpy_df.rs @@ -113,7 +113,7 @@ fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> { fn check_df_columns_contiguous(df: &DataFrame) -> bool { let columns = df.get_columns(); - if columns.iter().any(|s| s.n_chunks() > 1) { + if columns.iter().any(|s| s.as_materialized_series().n_chunks() > 1) { return false; } if columns.len() <= 1 { @@ -126,7 +126,7 @@ fn check_df_columns_contiguous(df: &DataFrame) -> bool { let slices = columns .iter() .map(|s| { - let ca: &ChunkedArray<$T> = s.unpack().unwrap(); + let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap(); ca.data_views().next().unwrap() }) .collect::>(); @@ -174,7 +174,7 @@ where T: PolarsNumericType, T::Native: Element, { - let ca: &ChunkedArray = 
df.get_columns().first().unwrap().unpack().unwrap(); + let ca: &ChunkedArray = df.get_columns().first().unwrap().as_materialized_series().unpack().unwrap(); let first_slice = ca.data_views().next().unwrap(); let start_ptr = first_slice.as_ptr(); diff --git a/crates/polars-python/src/map/dataframe.rs b/crates/polars-python/src/map/dataframe.rs index 5be2216b0898..1fbfe5d9232a 100644 --- a/crates/polars-python/src/map/dataframe.rs +++ b/crates/polars-python/src/map/dataframe.rs @@ -10,12 +10,18 @@ use crate::PyDataFrame; /// Create iterators for all the Series in the DataFrame. fn get_iters(df: &DataFrame) -> Vec { - df.get_columns().iter().map(|s| s.iter()).collect() + df.get_columns() + .iter() + .map(|s| s.as_materialized_series().iter()) + .collect() } /// Create iterators for all the Series in the DataFrame, skipping the first `n` rows. fn get_iters_skip(df: &DataFrame, n: usize) -> Vec> { - df.get_columns().iter().map(|s| s.iter().skip(n)).collect() + df.get_columns() + .iter() + .map(|s| s.as_materialized_series().iter().skip(n)) + .collect() } // the return type is Union[PySeries, PyDataFrame] and a boolean indicating if it is a dataframe or not diff --git a/crates/polars-python/src/map/lazy.rs b/crates/polars-python/src/map/lazy.rs index f7edcbe3facb..c1a680056774 100644 --- a/crates/polars-python/src/map/lazy.rs +++ b/crates/polars-python/src/map/lazy.rs @@ -112,7 +112,7 @@ pub(crate) fn binary_lambda( .collect()?; let s = out.select_at_idx(0).unwrap().clone(); - PySeries::new(s) + PySeries::new(s.take_materialized_series()) } else { return Some(result_series_wrapper.to_series(py, &pypolars.into_py(py), "")) .transpose(); @@ -138,9 +138,9 @@ pub fn map_single( pyexpr.inner.clone().map_python(func, agg_list).into() } -pub(crate) fn call_lambda_with_series_slice( +pub(crate) fn call_lambda_with_columns_slice( py: Python, - s: &[Series], + s: &[Column], lambda: &PyObject, polars_module: &PyObject, ) -> PyObject { @@ -148,7 +148,7 @@ pub(crate) fn 
call_lambda_with_series_slice( // create a PySeries struct/object for Python let iter = s.iter().map(|s| { - let ps = PySeries::new(s.clone()); + let ps = PySeries::new(s.as_materialized_series().clone()); // Wrap this PySeries object in the python side Series wrapper let python_series_wrapper = pypolars.getattr("wrap_s").unwrap().call1((ps,)).unwrap(); @@ -176,17 +176,17 @@ pub fn map_mul( // do the import outside of the function to prevent import side effects in a hot loop. let pypolars = PyModule::import_bound(py, "polars").unwrap().to_object(py); - let function = move |s: &mut [Series]| { + let function = move |s: &mut [Column]| { Python::with_gil(|py| { // this is a python Series - let out = call_lambda_with_series_slice(py, s, &lambda, &pypolars); + let out = call_lambda_with_columns_slice(py, s, &lambda, &pypolars); // we return an error, because that will become a null value polars lazy apply list if map_groups && out.is_none(py) { return Ok(None); } - Ok(Some(out.to_series(py, &pypolars, "")?)) + Ok(Some(out.to_series(py, &pypolars, "")?.into_column())) }) }; diff --git a/crates/polars-python/src/on_startup.rs b/crates/polars-python/src/on_startup.rs index 3f08f71740b5..9b6f17d46f72 100644 --- a/crates/polars-python/src/on_startup.rs +++ b/crates/polars-python/src/on_startup.rs @@ -15,11 +15,11 @@ use crate::prelude::ObjectValue; use crate::py_modules::{POLARS, UTILS}; use crate::Wrap; -fn python_function_caller_series(s: Series, lambda: &PyObject) -> PolarsResult { +fn python_function_caller_series(s: Column, lambda: &PyObject) -> PolarsResult { Python::with_gil(|py| { - let object = call_lambda_with_series(py, s.clone(), lambda) + let object = call_lambda_with_series(py, s.clone().take_materialized_series(), lambda) .map_err(|s| ComputeError(format!("{}", s).into()))?; - object.to_series(py, &POLARS, s.name()) + object.to_series(py, &POLARS, s.name()).map(Column::from) }) } @@ -92,7 +92,7 @@ pub fn register_startup_deps() { let physical_dtype = 
ArrowDataType::FixedSizeBinary(object_size); registry::register_object_builder(object_builder, object_converter, physical_dtype); // register SERIES UDF - unsafe { python_udf::CALL_SERIES_UDF_PYTHON = Some(python_function_caller_series) } + unsafe { python_udf::CALL_COLUMNS_UDF_PYTHON = Some(python_function_caller_series) } // register DATAFRAME UDF unsafe { python_udf::CALL_DF_UDF_PYTHON = Some(python_function_caller_df) } // register warning function for `polars_warn!` diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index 359f39df6291..e0563a9c327d 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -404,7 +404,7 @@ impl PySeries { df.pop() .map(|s| { - self.series = s; + self.series = s.take_materialized_series(); }) .ok_or_else(|| { PyPolarsErr::from(PolarsError::NoData( diff --git a/crates/polars-python/src/series/mod.rs b/crates/polars-python/src/series/mod.rs index 1b4542b06c5a..0c1ecbc40b1c 100644 --- a/crates/polars-python/src/series/mod.rs +++ b/crates/polars-python/src/series/mod.rs @@ -23,7 +23,7 @@ mod numpy_ufunc; #[cfg(feature = "pymethods")] mod scatter; -use polars::prelude::Series; +use polars::prelude::{Column, Series}; use pyo3::pyclass; #[pyclass] @@ -66,3 +66,14 @@ impl ToPySeries for Vec { unsafe { std::mem::transmute(self) } } } + +impl ToPySeries for Vec { + fn to_pyseries(self) -> Vec { + // @scalar-opt + let series: Vec = self + .into_iter() + .map(|c| c.take_materialized_series()) + .collect(); + series.to_pyseries() + } +} From 12ab77c0d9bba87553e91bd7252287503a684f30 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 15:24:54 +0200 Subject: [PATCH 10/42] Tiny fixes to make the tests pass again --- crates/polars-core/src/frame/column.rs | 29 +++++++++++++++++-- .../src/executors/group_by_partitioned.rs | 19 ++++++------ py-polars/tests/unit/dataframe/test_serde.py | 1 + 3 files changed, 37 insertions(+), 12 
deletions(-) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index beb8b4058d50..fd845c518d06 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -13,18 +13,17 @@ use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +#[cfg_attr(feature = "serde", serde(from = "Series"))] +#[cfg_attr(feature = "serde", serde(into = "_SerdeSeries"))] pub enum Column { Series(Series), Scalar(ScalarColumn), } #[derive(Debug, Clone)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] pub struct ScalarColumn { name: PlSmallStr, value: AnyValue<'static>, - // invariant: Series.len() == length - #[cfg_attr(feature = "serde", serde(skip))] materialized: OnceLock, length: usize, } @@ -970,3 +969,27 @@ impl IntoColumn for Column { self } } + +/// We don't want to serialize the scalar columns. So this helps pretend that columns are always +/// initialized without implementing From for Series. +/// +/// Those casts should be explicit. +#[derive(Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize))] +#[cfg_attr(feature = "serde", serde(into = "Series"))] +struct _SerdeSeries(Series); + +impl From for _SerdeSeries { + #[inline] + fn from(value: Column) -> Self { + Self(value.take_materialized_series()) + } +} + +impl Into for _SerdeSeries { + #[inline] + fn into(self) -> Series { + self.0 + } +} + diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index 3a5626fb303f..658b259f0809 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -293,17 +293,18 @@ impl PartitionGroupByExec { // MERGE phase - // @scalar-correctness? 
let df = accumulate_dataframes_vertical(splitted_dfs)?; - let keys: Vec = - splitted_keys - .into_iter() - .fold(Vec::::new(), |mut acc, e| { - acc.iter_mut().zip(e).for_each(|(acc, e)| { - _ = acc.append(&e.into_column()); - }); - acc + let keys = splitted_keys + .into_iter() + .reduce(|mut acc, e| { + acc.iter_mut().zip(e).for_each(|(acc, e)| { + let _ = acc.append(&e); }); + acc + }) + .unwrap(); + // @scalar-opt + let keys = keys.into_iter().map(Column::from).collect(); // the partitioned group_by has added columns so we must update the schema. state.set_schema(self.output_schema.clone()); diff --git a/py-polars/tests/unit/dataframe/test_serde.py b/py-polars/tests/unit/dataframe/test_serde.py index 29d4eb5b05a6..71936c9eae81 100644 --- a/py-polars/tests/unit/dataframe/test_serde.py +++ b/py-polars/tests/unit/dataframe/test_serde.py @@ -65,6 +65,7 @@ def test_df_serialize_json() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [9, 5, 6]}).sort("a") result = df.serialize(format="json") expected = '{"columns":[{"name":"a","datatype":"Int64","bit_settings":"SORTED_ASC","values":[1,2,3]},{"name":"b","datatype":"Int64","bit_settings":"","values":[9,5,6]}]}' + print(result) assert result == expected From 977e9d0432d6bf88cc54de03f8e9de8beeea0898 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 16:16:33 +0200 Subject: [PATCH 11/42] fix many of the rust tests --- .../ops/sort/arg_sort_multiple.rs | 19 ++--- crates/polars-core/src/frame/column.rs | 22 +++++ .../src/frame/group_by/into_groups.rs | 2 +- crates/polars-core/src/frame/group_by/mod.rs | 2 +- crates/polars-lazy/src/tests/aggregations.rs | 4 +- crates/polars-lazy/src/tests/arity.rs | 2 +- .../src/tests/optimization_checks.rs | 6 +- .../src/tests/predicate_queries.rs | 2 +- .../src/tests/projection_queries.rs | 2 +- crates/polars-lazy/src/tests/queries.rs | 48 +++++------ .../polars-mem-engine/src/executors/join.rs | 4 +- crates/polars-ops/src/chunked_array/mode.rs | 24 +++--- 
.../polars-ops/src/frame/join/iejoin/mod.rs | 28 +++---- crates/polars-ops/src/frame/join/mod.rs | 48 ++++++++--- crates/polars-ops/src/series/ops/abs.rs | 7 +- crates/polars-pipe/src/operators/chunks.rs | 10 ++- .../polars-plan/src/dsl/function_expr/abs.rs | 2 +- .../src/dsl/function_expr/dispatch.rs | 8 +- .../src/plans/conversion/type_coercion/mod.rs | 2 +- crates/polars-sql/tests/issues.rs | 4 +- crates/polars-sql/tests/simple_exprs.rs | 4 +- crates/polars-sql/tests/statements.rs | 4 +- crates/polars-sql/tests/udf.rs | 16 ++-- crates/polars-time/src/group_by/dynamic.rs | 29 +++---- crates/polars/src/docs/eager.rs | 17 ++-- crates/polars/src/docs/lazy.rs | 5 +- crates/polars/src/lib.rs | 2 +- crates/polars/tests/it/core/date_like.rs | 2 +- crates/polars/tests/it/core/joins.rs | 84 +++++++++++-------- crates/polars/tests/it/core/pivot.rs | 6 +- crates/polars/tests/it/io/csv.rs | 26 +++--- crates/polars/tests/it/io/ipc.rs | 6 +- crates/polars/tests/it/io/ipc_stream.rs | 2 +- crates/polars/tests/it/io/mod.rs | 4 +- crates/polars/tests/it/joins.rs | 6 +- crates/polars/tests/it/lazy/aggregation.rs | 8 +- crates/polars/tests/it/lazy/cwc.rs | 6 +- .../polars/tests/it/lazy/expressions/arity.rs | 4 +- .../tests/it/lazy/expressions/window.rs | 14 ++-- crates/polars/tests/it/lazy/exprs.rs | 12 +-- crates/polars/tests/it/lazy/group_by.rs | 4 +- crates/polars/tests/it/lazy/queries.rs | 14 ++-- 42 files changed, 287 insertions(+), 234 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs index 4fc01211f3df..5653039ff02e 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs @@ -106,7 +106,7 @@ pub fn _get_rows_encoded_compat_array(by: &Series) -> PolarsResult { Ok(out) } -pub fn encode_rows_vertical_par_unordered(by: &[Column]) -> PolarsResult { +pub fn 
encode_rows_vertical_par_unordered(by: &[Series]) -> PolarsResult { let n_threads = POOL.current_num_threads(); let len = by[0].len(); let splits = _split_offsets(len, n_threads); @@ -129,7 +129,7 @@ pub fn encode_rows_vertical_par_unordered(by: &[Column]) -> PolarsResult PolarsResult { let n_threads = POOL.current_num_threads(); let len = by[0].len(); @@ -138,15 +138,14 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( let chunks = splits.into_par_iter().map(|(offset, len)| { let sliced = by .iter() - .map(|s| s.as_materialized_series().slice(offset as i64, len)) - .map(Column::from) + .map(|s| s.slice(offset as i64, len)) .collect::>(); let rows = _get_rows_encoded_unordered(&sliced)?; let validities = sliced .iter() - .flat_map(|c| { - let s = c.as_materialized_series().rechunk(); + .flat_map(|s| { + let s = s.rechunk(); #[allow(clippy::unnecessary_to_owned)] s.chunks() .to_vec() @@ -166,7 +165,7 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( )) } -pub(crate) fn encode_rows_unordered(by: &[Column]) -> PolarsResult { +pub(crate) fn encode_rows_unordered(by: &[Series]) -> PolarsResult { let rows = _get_rows_encoded_unordered(by)?; Ok(BinaryOffsetChunked::with_chunk( PlSmallStr::EMPTY, @@ -174,11 +173,11 @@ pub(crate) fn encode_rows_unordered(by: &[Column]) -> PolarsResult PolarsResult { +pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult { let mut cols = Vec::with_capacity(by.len()); let mut fields = Vec::with_capacity(by.len()); for by in by { - let arr = _get_rows_encoded_compat_array(by.as_materialized_series())?; + let arr = _get_rows_encoded_compat_array(by)?; let field = EncodingField::new_unsorted(); match arr.dtype() { // Flatten the struct fields. 
@@ -256,7 +255,7 @@ pub fn _get_rows_encoded_arr( pub fn _get_rows_encoded_ca_unordered( name: PlSmallStr, - by: &[Column], + by: &[Series], ) -> PolarsResult { _get_rows_encoded_unordered(by) .map(|rows| BinaryOffsetChunked::with_chunk(name, rows.into_array())) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index fd845c518d06..71344b3e9005 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -792,6 +792,28 @@ impl Sub for &Column { } } +impl Mul for Column { + type Output = PolarsResult; + + fn mul(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .mul(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Mul for &Column { + type Output = PolarsResult; + + fn mul(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .mul(rhs.as_materialized_series()) + .map(Column::from) + } +} + impl Sub for &Column where T: Num + NumCast, diff --git a/crates/polars-core/src/frame/group_by/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs index 519d0d2d0b0e..bdaa439a1232 100644 --- a/crates/polars-core/src/frame/group_by/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -320,7 +320,7 @@ impl IntoGroupsProxy for ListChunked { sorted: bool, ) -> PolarsResult { multithreaded &= POOL.current_num_threads() > 1; - let by = &[self.clone().into_column()]; + let by = &[self.clone().into_series()]; let ca = if multithreaded { encode_rows_vertical_par_unordered(by).unwrap() } else { diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index e02d6069c89a..322c6b967e33 100644 --- a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -74,7 +74,7 @@ impl DataFrame { let by = by .iter() .filter(|s| !s.dtype().is_null()) - .cloned() + .map(|c| c.as_materialized_series().clone()) 
.collect::>(); if by.is_empty() { let groups = if self.is_empty() { diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index 54387451a8b7..6b2d8cb05da0 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -63,12 +63,12 @@ fn test_agg_unique_first() -> PolarsResult<()> { .collect()?; let a = out.column("v_first").unwrap(); - let a = a.sum::().unwrap(); + let a = a.as_materialized_series().sum::().unwrap(); // can be both because unique does not guarantee order assert!(a == 10 || a == 11); let a = out.column("true_first").unwrap(); - let a = a.sum::().unwrap(); + let a = a.as_materialized_series().sum::().unwrap(); // can be both because unique does not guarantee order assert_eq!(a, 10); diff --git a/crates/polars-lazy/src/tests/arity.rs b/crates/polars-lazy/src/tests/arity.rs index c6f7b4381b53..439d2a8be587 100644 --- a/crates/polars-lazy/src/tests/arity.rs +++ b/crates/polars-lazy/src/tests/arity.rs @@ -72,5 +72,5 @@ fn test_lazy_ternary() { ) .collect() .unwrap(); - assert_eq!(43, df.column("new").unwrap().sum::().unwrap()); + assert_eq!(43, df.column("new").unwrap().as_materialized_series().sum::().unwrap()); } diff --git a/crates/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs index e01ad342f061..4a99413d48cc 100644 --- a/crates/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -310,7 +310,7 @@ pub fn test_predicate_block_cast() -> PolarsResult<()> { let s = out.column("value").unwrap(); assert_eq!( s, - &Series::new(PlSmallStr::from_static("value"), [1.0f32, 2.0]) + &Column::new(PlSmallStr::from_static("value"), [1.0f32, 2.0]) ); } @@ -325,7 +325,7 @@ fn test_lazy_filter_and_rename() { .lazy() .rename(["a"], ["x"]) .filter(col("x").map( - |s: Series| Ok(Some(s.gt(3)?.into_series())), + |s: Column| 
Ok(Some(s.as_materialized_series().gt(3)?.into_column())), GetOutput::from_type(DataType::Boolean), )) .select([col("x")]); @@ -338,7 +338,7 @@ fn test_lazy_filter_and_rename() { // now we check if the column is rename or added when we don't select let lf = df.lazy().rename(["a"], ["x"]).filter(col("x").map( - |s: Series| Ok(Some(s.gt(3)?.into_series())), + |s: Column| Ok(Some(s.as_materialized_series().gt(3)?.into_column())), GetOutput::from_type(DataType::Boolean), )); // the rename function should not interfere with the predicate pushdown diff --git a/crates/polars-lazy/src/tests/predicate_queries.rs b/crates/polars-lazy/src/tests/predicate_queries.rs index 855d9463f814..71d24d1207e1 100644 --- a/crates/polars-lazy/src/tests/predicate_queries.rs +++ b/crates/polars-lazy/src/tests/predicate_queries.rs @@ -72,7 +72,7 @@ fn test_pass_unrelated_apply() -> PolarsResult<()> { let q = df .lazy() .with_column(col("A").map( - |s| Ok(Some(s.is_null().into_series())), + |s| Ok(Some(s.is_null().into_column())), GetOutput::from_type(DataType::Boolean), )) .filter(col("B").gt(lit(10i32))); diff --git a/crates/polars-lazy/src/tests/projection_queries.rs b/crates/polars-lazy/src/tests/projection_queries.rs index b2cff519c05a..d1594a461a86 100644 --- a/crates/polars-lazy/src/tests/projection_queries.rs +++ b/crates/polars-lazy/src/tests/projection_queries.rs @@ -130,7 +130,7 @@ fn concat_str_regex_expansion() -> PolarsResult<()> { let s = out.column("concatenated")?; assert_eq!( s, - &Series::new("concatenated".into(), ["a--;;", ";b--;", ";;c--"]) + &Column::new("concatenated".into(), ["a--;;", ";b--;", ";;c--"]) ); Ok(()) diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index d1566cbb8680..5bdc32651860 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -219,7 +219,7 @@ fn test_lazy_ternary_and_predicates() { let length = new.column("sepal_length").unwrap(); assert_eq!( length, - 
&Series::new("sepal_length".into(), &[5.1f64, 5.0, 5.4]) + &Column::new("sepal_length".into(), &[5.1f64, 5.0, 5.4]) ); assert_eq!(new.shape(), (3, 6)); } @@ -232,7 +232,7 @@ fn test_lazy_binary_ops() { .select([col("a").eq(lit(2)).alias("foo")]) .collect() .unwrap(); - assert_eq!(new.column("foo").unwrap().sum::().unwrap(), 1); + assert_eq!(new.column("foo").unwrap().as_materialized_series().sum::().unwrap(), 1); } #[test] @@ -277,7 +277,7 @@ fn test_lazy_query_4() -> PolarsResult<()> { col("day").alias("day"), col("cumcases") .apply( - |s: Series| (&s - &(s.shift(1))).map(Some), + |s: Column| (&s - &(s.shift(1))).map(Some), GetOutput::same_type(), ) .alias("diff_cases"), @@ -414,7 +414,7 @@ fn test_lazy_query_9() -> PolarsResult<()> { fn test_lazy_query_10() { use polars_core::export::chrono::Duration as ChronoDuration; let date = NaiveDate::from_ymd_opt(2021, 3, 5).unwrap(); - let x: Series = DatetimeChunked::from_naive_datetime( + let x = DatetimeChunked::from_naive_datetime( "x".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(12, 0, 0).unwrap()), @@ -423,8 +423,8 @@ fn test_lazy_query_10() { ], TimeUnit::Nanoseconds, ) - .into(); - let y: Series = DatetimeChunked::from_naive_datetime( + .into_column(); + let y = DatetimeChunked::from_naive_datetime( "y".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(11, 0, 0).unwrap()), @@ -433,14 +433,14 @@ fn test_lazy_query_10() { ], TimeUnit::Nanoseconds, ) - .into(); + .into_column(); let df = DataFrame::new(vec![x, y]).unwrap(); let out = df .lazy() .select(&[(col("x") - col("y")).alias("z")]) .collect() .unwrap(); - let z: Series = DurationChunked::from_duration( + let z = DurationChunked::from_duration( "z".into(), [ ChronoDuration::try_hours(1).unwrap(), @@ -449,9 +449,9 @@ fn test_lazy_query_10() { ], TimeUnit::Nanoseconds, ) - .into(); + .into_column(); assert!(out.column("z").unwrap().equals(&z)); - let x: Series = DatetimeChunked::from_naive_datetime( + let x = 
DatetimeChunked::from_naive_datetime( "x".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(2, 0, 0).unwrap()), @@ -460,8 +460,8 @@ fn test_lazy_query_10() { ], TimeUnit::Milliseconds, ) - .into(); - let y: Series = DatetimeChunked::from_naive_datetime( + .into_column(); + let y = DatetimeChunked::from_naive_datetime( "y".into(), [ NaiveDateTime::new(date, NaiveTime::from_hms_opt(1, 0, 0).unwrap()), @@ -470,7 +470,7 @@ fn test_lazy_query_10() { ], TimeUnit::Nanoseconds, ) - .into(); + .into_column(); let df = DataFrame::new(vec![x, y]).unwrap(); let out = df .lazy() @@ -501,8 +501,8 @@ fn test_lazy_query_7() { ]; let data = vec![Some(1.), Some(2.), Some(3.), Some(4.), None, None]; let df = DataFrame::new(vec![ - DatetimeChunked::from_naive_datetime("date".into(), dates, TimeUnit::Nanoseconds).into(), - Series::new("data".into(), data), + DatetimeChunked::from_naive_datetime("date".into(), dates, TimeUnit::Nanoseconds).into_column(), + Column::new("data".into(), data), ]) .unwrap(); // this tests if predicate pushdown not interferes with the shift data. 
@@ -516,14 +516,14 @@ fn test_lazy_query_7() { )))) .collect() .unwrap(); - let a = out.column("shifted").unwrap().sum::().unwrap() - 7.0; + let a = out.column("shifted").unwrap().as_materialized_series().sum::().unwrap() - 7.0; assert!(a < 0.01 && a > -0.01); } #[test] fn test_lazy_shift_and_fill_all() { let data = &[1, 2, 3]; - let df = DataFrame::new(vec![Series::new("data".into(), data)]).unwrap(); + let df = DataFrame::new(vec![Column::new("data".into(), data)]).unwrap(); let out = df .lazy() .with_column(col("data").shift(lit(1)).fill_null(lit(0)).alias("output")) @@ -714,7 +714,7 @@ fn test_lazy_group_by_apply() { df.lazy() .group_by([col("fruits")]) .agg([col("cars").apply( - |s: Series| Ok(Some(Series::new("".into(), &[s.len() as u32]))), + |s: Column| Ok(Some(Column::new("".into(), &[s.len() as u32]))), GetOutput::from_type(DataType::UInt32), )]) .collect() @@ -1120,7 +1120,7 @@ fn test_filter_lit() { // see https://github.com/pola-rs/polars/issues/790 // failed due to broadcasting filters and splitting threads. 
let iter = (0..100).map(|i| ('A'..='Z').nth(i % 26).unwrap().to_string()); - let a = Series::from_iter(iter); + let a = Series::from_iter(iter).into_column(); let df = DataFrame::new([a].into()).unwrap(); let out = df.lazy().filter(lit(true)).collect().unwrap(); @@ -1471,10 +1471,10 @@ fn test_singleton_broadcast() -> PolarsResult<()> { #[test] fn test_list_in_select_context() -> PolarsResult<()> { - let s = Series::new("a".into(), &[1, 2, 3]); + let s = Column::new("a".into(), &[1, 2, 3]); let mut builder = get_list_builder(s.dtype(), s.len(), 1, s.name().clone()).unwrap(); - builder.append_series(&s).unwrap(); - let expected = builder.finish().into_series(); + builder.append_series(s.as_materialized_series()).unwrap(); + let expected = builder.finish().into_column(); let df = DataFrame::new(vec![s])?; @@ -1549,8 +1549,8 @@ fn test_round_after_agg() -> PolarsResult<()> { #[test] #[cfg(feature = "dtype-date")] fn test_fill_nan() -> PolarsResult<()> { - let s0 = Series::new("date".into(), &[1, 2, 3]).cast(&DataType::Date)?; - let s1 = Series::new("float".into(), &[Some(1.0), Some(f32::NAN), Some(3.0)]); + let s0 = Column::new("date".into(), &[1, 2, 3]).cast(&DataType::Date)?; + let s1 = Column::new("float".into(), &[Some(1.0), Some(f32::NAN), Some(3.0)]); let df = DataFrame::new(vec![s0, s1])?; let out = df.lazy().fill_nan(Null {}.lit()).collect()?; diff --git a/crates/polars-mem-engine/src/executors/join.rs b/crates/polars-mem-engine/src/executors/join.rs index 0c6ea2bd5f0a..5edab8551ece 100644 --- a/crates/polars-mem-engine/src/executors/join.rs +++ b/crates/polars-mem-engine/src/executors/join.rs @@ -88,13 +88,13 @@ impl Executor for JoinExec { let left_on_series = self .left_on .iter() - .map(|e| e.evaluate(&df_left, state).map(Column::from)) + .map(|e| e.evaluate(&df_left, state)) .collect::>>()?; let right_on_series = self .right_on .iter() - .map(|e| e.evaluate(&df_right, state).map(Column::from)) + .map(|e| e.evaluate(&df_right, state)) .collect::>>()?; // 
prepare the tolerance diff --git a/crates/polars-ops/src/chunked_array/mode.rs b/crates/polars-ops/src/chunked_array/mode.rs index 3c2c3025506f..a36b161775ca 100644 --- a/crates/polars-ops/src/chunked_array/mode.rs +++ b/crates/polars-ops/src/chunked_array/mode.rs @@ -19,15 +19,13 @@ where } fn mode_f32(ca: &Float32Chunked) -> PolarsResult { - // @scalar-opt - let s = ca.apply_as_ints(|v| mode(&v.clone().into()).unwrap().as_materialized_series().clone()); + let s = ca.apply_as_ints(|v| mode(v).unwrap()); let ca = s.f32().unwrap().clone(); Ok(ca) } fn mode_64(ca: &Float64Chunked) -> PolarsResult { - // @scalar-opt - let s = ca.apply_as_ints(|v| mode(&v.clone().into()).unwrap().as_materialized_series().clone()); + let s = ca.apply_as_ints(|v| mode(v).unwrap()); let ca = s.f64().unwrap().clone(); Ok(ca) } @@ -63,18 +61,18 @@ fn mode_indices(groups: GroupsProxy) -> Vec { } } -pub fn mode(s: &Column) -> PolarsResult { +pub fn mode(s: &Series) -> PolarsResult { let s_phys = s.to_physical_repr(); let out = match s_phys.dtype() { - DataType::Binary => mode_primitive(s_phys.binary().unwrap())?.into_column(), - DataType::Boolean => mode_primitive(s_phys.bool().unwrap())?.into_column(), - DataType::Float32 => mode_f32(s_phys.f32().unwrap())?.into_column(), - DataType::Float64 => mode_64(s_phys.f64().unwrap())?.into_column(), - DataType::String => mode_primitive(&s_phys.str().unwrap().as_binary())?.into_column(), + DataType::Binary => mode_primitive(s_phys.binary().unwrap())?.into_series(), + DataType::Boolean => mode_primitive(s_phys.bool().unwrap())?.into_series(), + DataType::Float32 => mode_f32(s_phys.f32().unwrap())?.into_series(), + DataType::Float64 => mode_64(s_phys.f64().unwrap())?.into_series(), + DataType::String => mode_primitive(&s_phys.str().unwrap().as_binary())?.into_series(), dt if dt.is_integer() => { with_match_physical_integer_polars_type!(dt, |$T| { - let ca: &ChunkedArray<$T> = s_phys.as_materialized_series().as_ref().as_ref().as_ref(); - 
mode_primitive(ca)?.into_column() + let ca: &ChunkedArray<$T> = s_phys.as_ref().as_ref().as_ref(); + mode_primitive(ca)?.into_series() }) }, _ => polars_bail!(opq = mode, s.dtype()), @@ -122,7 +120,7 @@ mod test { ca_builder.append_value("test2"); ca_builder.append_value("test2"); ca_builder.append_value("test2"); - let s = ca_builder.finish().into_column(); + let s = ca_builder.finish().into_series(); let result = mode(&s).unwrap(); assert_eq!(result.str_value(0).unwrap(), "test2"); assert_eq!(result.len(), 1); diff --git a/crates/polars-ops/src/frame/join/iejoin/mod.rs b/crates/polars-ops/src/frame/join/iejoin/mod.rs index 5d655ca5ee22..cd19f7049abe 100644 --- a/crates/polars-ops/src/frame/join/iejoin/mod.rs +++ b/crates/polars-ops/src/frame/join/iejoin/mod.rs @@ -205,8 +205,8 @@ where pub(super) fn iejoin_par( left: &DataFrame, right: &DataFrame, - selected_left: Vec, - selected_right: Vec, + selected_left: Vec, + selected_right: Vec, options: &IEJoinOptions, suffix: Option, slice: Option<(i64, usize)>, @@ -221,12 +221,12 @@ pub(super) fn iejoin_par( .with_nulls_last(false) .with_order_descending(l1_descending); - let sl = &selected_left[0].as_materialized_series(); + let sl = &selected_left[0]; let l1_s_l = sl .arg_sort(l1_sort_options) .slice(sl.null_count() as i64, sl.len() - sl.null_count()); - let sr = &selected_right[0].as_materialized_series(); + let sr = &selected_right[0]; let l1_s_r = sr .arg_sort(l1_sort_options) .slice(sr.null_count() as i64, sr.len() - sr.null_count()); @@ -282,11 +282,11 @@ pub(super) fn iejoin_par( ( selected_left .iter() - .map(|s| s.as_materialized_series().take_unchecked(l_l1_idx).into()) + .map(|s| s.take_unchecked(l_l1_idx)) .collect_vec(), selected_right .iter() - .map(|s| s.as_materialized_series().take_unchecked(r_l1_idx).into()) + .map(|s| s.take_unchecked(r_l1_idx)) .collect_vec(), ) }; @@ -342,8 +342,8 @@ pub(super) fn iejoin_par( pub(super) fn iejoin( left: &DataFrame, right: &DataFrame, - selected_left: Vec, - 
selected_right: Vec, + selected_left: Vec, + selected_right: Vec, options: &IEJoinOptions, suffix: Option, slice: Option<(i64, usize)>, @@ -378,8 +378,8 @@ unsafe fn materialize_join( /// Based on Khayyat et al. 2015, "Lightning Fast and Space Efficient Inequality Joins" /// and extended to work with duplicate values. fn iejoin_tuples( - selected_left: Vec, - selected_right: Vec, + selected_left: Vec, + selected_right: Vec, options: &IEJoinOptions, slice: Option<(i64, usize)>, ) -> PolarsResult<(IdxCa, IdxCa)> { @@ -411,14 +411,14 @@ fn iejoin_tuples( let l1_descending = matches!(op1, InequalityOperator::Gt | InequalityOperator::GtEq); let l2_descending = matches!(op2, InequalityOperator::Lt | InequalityOperator::LtEq); - let mut x = selected_left[0].to_physical_repr(); + let mut x = selected_left[0].to_physical_repr().into_owned(); let left_height = x.len(); x.extend(&selected_right[0].to_physical_repr())?; // Rechunk because we will gather. let x = x.rechunk(); - let mut y = selected_left[1].to_physical_repr(); + let mut y = selected_left[1].to_physical_repr().into_owned(); y.extend(&selected_right[1].to_physical_repr())?; // Rechunk because we will gather. 
let y = y.rechunk(); @@ -432,7 +432,7 @@ fn iejoin_tuples( .arg_sort(l1_sort_options) .slice(x.null_count() as i64, x.len() - x.null_count()); - let y_ordered_by_x = unsafe { y.as_materialized_series().take_unchecked(&l1_order) }; + let y_ordered_by_x = unsafe { y.take_unchecked(&l1_order) }; let l2_sort_options = SortOptions::default() .with_maintain_order(true) .with_nulls_last(false) @@ -455,7 +455,7 @@ fn iejoin_tuples( l2_order, op1, op2, - x.as_materialized_series().clone(), + x, y_ordered_by_x, left_height ) diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 2cd9bd323690..cbfcec5ea3f1 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -95,6 +95,16 @@ pub trait DataFrameJoinOps: IntoDf { let df_left = self.to_df(); let selected_left = df_left.select_columns(left_on)?; let selected_right = other.select_columns(right_on)?; + + let selected_left = selected_left + .into_iter() + .map(Column::take_materialized_series) + .collect::>(); + let selected_right = selected_right + .into_iter() + .map(Column::take_materialized_series) + .collect::>(); + self._join_impl(other, selected_left, selected_right, args, true, false) } @@ -104,8 +114,8 @@ pub trait DataFrameJoinOps: IntoDf { fn _join_impl( &self, other: &DataFrame, - mut selected_left: Vec, - mut selected_right: Vec, + mut selected_left: Vec, + mut selected_right: Vec, mut args: JoinArgs, _check_rechunk: bool, _verbose: bool, @@ -118,7 +128,7 @@ pub trait DataFrameJoinOps: IntoDf { } // Clear literals if a frame is empty. 
Otherwise we could get an oob - fn clear(s: &mut [Column]) { + fn clear(s: &mut [Series]) { for s in s.iter_mut() { if s.len() == 1 { *s = s.clear() @@ -195,8 +205,8 @@ pub trait DataFrameJoinOps: IntoDf { Err(_) => { let (ca_left, ca_right) = make_categoricals_compatible(l.categorical()?, r.categorical()?)?; - *l = ca_left.into_column().with_name(l.name().clone()); - *r = ca_right.into_column().with_name(r.name().clone()); + *l = ca_left.into_series().with_name(l.name().clone()); + *r = ca_right.into_series().with_name(r.name().clone()); }, } } @@ -222,8 +232,8 @@ pub trait DataFrameJoinOps: IntoDf { // Single keys. if selected_left.len() == 1 { - let s_left = &selected_left[0].as_materialized_series(); - let s_right = &selected_right[0].as_materialized_series(); + let s_left = &selected_left[0]; + let s_right = &selected_right[0]; let drop_names: Option> = if should_coalesce { None } else { Some(vec![]) }; return match args.how { @@ -513,15 +523,15 @@ trait DataFrameJoinOpsPrivate: IntoDf { impl DataFrameJoinOps for DataFrame {} impl DataFrameJoinOpsPrivate for DataFrame {} -fn prepare_keys_multiple(s: &[Column], join_nulls: bool) -> PolarsResult { +fn prepare_keys_multiple(s: &[Series], join_nulls: bool) -> PolarsResult { let keys = s .iter() .map(|s| { let phys = s.to_physical_repr(); match phys.dtype() { - DataType::Float32 => phys.f32().unwrap().to_canonical().into_column(), - DataType::Float64 => phys.f64().unwrap().to_canonical().into_column(), - _ => phys, + DataType::Float32 => phys.f32().unwrap().to_canonical().into_series(), + DataType::Float64 => phys.f64().unwrap().to_canonical().into_series(), + _ => phys.into_owned(), } }) .collect::>(); @@ -537,7 +547,19 @@ pub fn private_left_join_multiple_keys( b: &DataFrame, join_nulls: bool, ) -> PolarsResult { - let a = prepare_keys_multiple(a.get_columns(), join_nulls)?.into_series(); - let b = prepare_keys_multiple(b.get_columns(), join_nulls)?.into_series(); + // @scalar-opt + let a_cols = a + 
.get_columns() + .iter() + .map(|c| c.as_materialized_series().clone()) + .collect::>(); + let b_cols = b + .get_columns() + .iter() + .map(|c| c.as_materialized_series().clone()) + .collect::>(); + + let a = prepare_keys_multiple(&a_cols, join_nulls)?.into_series(); + let b = prepare_keys_multiple(&b_cols, join_nulls)?.into_series(); sort_or_hash_left(&a, &b, false, JoinValidation::ManyToMany, join_nulls) } diff --git a/crates/polars-ops/src/series/ops/abs.rs b/crates/polars-ops/src/series/ops/abs.rs index 21a1213ae1d1..5a84678df591 100644 --- a/crates/polars-ops/src/series/ops/abs.rs +++ b/crates/polars-ops/src/series/ops/abs.rs @@ -1,10 +1,7 @@ use polars_core::prelude::*; /// Convert numerical values to their absolute value. -pub fn abs(c: &Column) -> PolarsResult { - // @scalar-opt - let s = c.as_materialized_series(); - +pub fn abs(s: &Series) -> PolarsResult { use DataType::*; let out = match s.dtype() { #[cfg(feature = "dtype-i8")] @@ -34,5 +31,5 @@ pub fn abs(c: &Column) -> PolarsResult { dt if dt.is_unsigned_integer() => s.clone(), dt => polars_bail!(opq = abs, dt), }; - Ok(out.into()) + Ok(out) } diff --git a/crates/polars-pipe/src/operators/chunks.rs b/crates/polars-pipe/src/operators/chunks.rs index 593936c82b81..c1f63019a611 100644 --- a/crates/polars-pipe/src/operators/chunks.rs +++ b/crates/polars-pipe/src/operators/chunks.rs @@ -138,7 +138,7 @@ mod test { .iter() .enumerate() .map(|(i, length)| { - let series = Series::new("val".into(), vec![i as u64; *length]); + let series = Column::new("val".into(), vec![i as u64; *length]); DataFrame::new(vec![series]).unwrap() }) .collect(); @@ -167,7 +167,13 @@ mod test { } } // Make sure all result DataFrames only have a single chunk. 
- assert_eq!(result_df.get_columns()[0].chunk_lengths().len(), 1); + assert_eq!( + result_df.get_columns()[0] + .as_materialized_series() + .chunk_lengths() + .len(), + 1 + ); } // Make sure the data was preserved: diff --git a/crates/polars-plan/src/dsl/function_expr/abs.rs b/crates/polars-plan/src/dsl/function_expr/abs.rs index 1f0435e05772..5464f06daada 100644 --- a/crates/polars-plan/src/dsl/function_expr/abs.rs +++ b/crates/polars-plan/src/dsl/function_expr/abs.rs @@ -1,5 +1,5 @@ use super::*; pub(super) fn abs(s: &Column) -> PolarsResult { - polars_ops::prelude::abs(s) + polars_ops::prelude::abs(s.as_materialized_series()).map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index 06d44a74f382..fb73f51ab5de 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -122,7 +122,7 @@ pub(super) fn drop_nulls(s: &Column) -> PolarsResult { #[cfg(feature = "mode")] pub(super) fn mode(s: &Column) -> PolarsResult { - mode::mode(s) + mode::mode(s.as_materialized_series()).map(Column::from) } #[cfg(feature = "moment")] @@ -160,11 +160,7 @@ pub(super) fn hist( include_category: bool, include_breakpoint: bool, ) -> PolarsResult { - let bins = if s.len() == 2 { - Some(&s[1]) - } else { - None - }; + let bins = if s.len() == 2 { Some(&s[1]) } else { None }; let s = s[0].as_materialized_series(); hist_series( s, diff --git a/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs b/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs index ceb3d7dffd49..fc8f520e86ea 100644 --- a/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs @@ -531,7 +531,7 @@ mod test { let optimizer = StackOptimizer {}; let rules: &mut [Box] = &mut [Box::new(TypeCoercionRule {})]; - let df = DataFrame::new(Vec::from([Series::new_empty( + let df = 
DataFrame::new(Vec::from([Column::new_empty( PlSmallStr::from_static("fruits"), &DataType::Categorical(None, Default::default()), )])) diff --git a/crates/polars-sql/tests/issues.rs b/crates/polars-sql/tests/issues.rs index 10ee22db49d3..7938266dc463 100644 --- a/crates/polars-sql/tests/issues.rs +++ b/crates/polars-sql/tests/issues.rs @@ -99,8 +99,6 @@ fn iss_7440() { #[test] #[cfg(feature = "csv")] fn iss_8395() -> PolarsResult<()> { - use polars_core::series::Series; - let mut context = SQLContext::new(); let sql = r#" with foods as ( @@ -113,7 +111,7 @@ fn iss_8395() -> PolarsResult<()> { // assert that the df only contains [vegetables, seafood] let s = df.column("category")?.unique()?.sort(Default::default())?; - let expected = Series::new("category".into(), &["seafood", "vegetables"]); + let expected = Column::new("category".into(), &["seafood", "vegetables"]); assert!(s.equals(&expected)); Ok(()) } diff --git a/crates/polars-sql/tests/simple_exprs.rs b/crates/polars-sql/tests/simple_exprs.rs index b84c6e681cd2..64b46c88656e 100644 --- a/crates/polars-sql/tests/simple_exprs.rs +++ b/crates/polars-sql/tests/simple_exprs.rs @@ -4,11 +4,11 @@ use polars_sql::*; use polars_time::Duration; fn create_sample_df() -> DataFrame { - let a = Series::new( + let a = Column::new( "a".into(), (1..10000i64).map(|i| i / 100).collect::>(), ); - let b = Series::new("b".into(), 1..10000i64); + let b = Column::new("b".into(), 1..10000i64); DataFrame::new(vec![a, b]).unwrap() } diff --git a/crates/polars-sql/tests/statements.rs b/crates/polars-sql/tests/statements.rs index 2657ec443077..c4af146eab9d 100644 --- a/crates/polars-sql/tests/statements.rs +++ b/crates/polars-sql/tests/statements.rs @@ -3,8 +3,8 @@ use polars_lazy::prelude::*; use polars_sql::*; fn create_ctx() -> SQLContext { - let a = Series::new("a".into(), (1..10i64).map(|i| i / 100).collect::>()); - let b = Series::new("b".into(), 1..10i64); + let a = Column::new("a".into(), (1..10i64).map(|i| i / 
100).collect::>()); + let b = Column::new("b".into(), 1..10i64); let df = DataFrame::new(vec![a, b]).unwrap().lazy(); let mut ctx = SQLContext::new(); ctx.register("df", df); diff --git a/crates/polars-sql/tests/udf.rs b/crates/polars-sql/tests/udf.rs index 3ccd1c4d6395..d8d4eec83d9c 100644 --- a/crates/polars-sql/tests/udf.rs +++ b/crates/polars-sql/tests/udf.rs @@ -39,10 +39,10 @@ fn test_udfs() -> PolarsResult<()> { Field::new("b".into(), DataType::Int32), ], GetOutput::same_type(), - move |s: &mut [Series]| { - let first = s[0].clone(); - let second = s[1].clone(); - (first + second).map(Some) + move |c: &mut [Column]| { + let first = c[0].as_materialized_series().clone(); + let second = c[1].as_materialized_series().clone(); + (first + second).map(Column::from).map(Some) }, ); @@ -74,10 +74,10 @@ fn test_udfs() -> PolarsResult<()> { Field::new("b".into(), DataType::Int32), ], GetOutput::same_type(), - move |s: &mut [Series]| { - let first = s[0].clone(); - let second = s[1].clone(); - (first / second).map(Some) + move |c: &mut [Column]| { + let first = c[0].as_materialized_series().clone(); + let second = c[1].as_materialized_series().clone(); + (first / second).map(Column::from).map(Some) }, ); diff --git a/crates/polars-time/src/group_by/dynamic.rs b/crates/polars-time/src/group_by/dynamic.rs index 8127d73ad432..480c678e920c 100644 --- a/crates/polars-time/src/group_by/dynamic.rs +++ b/crates/polars-time/src/group_by/dynamic.rs @@ -197,7 +197,8 @@ impl Wrap<&DataFrame> { if group_by.is_empty() { // If by is given, the column must be sorted in the 'by' arg, which we can not check now // this will be checked when the groups are materialized. 
- time.as_materialized_series().ensure_sorted_arg("group_by_dynamic")?; + time.as_materialized_series() + .ensure_sorted_arg("group_by_dynamic")?; } let time_type = time.dtype(); @@ -755,43 +756,43 @@ mod test { ) .unwrap(); - let nulls = Column::new( + let nulls = Series::new( "".into(), [Some(3), Some(7), None, Some(9), Some(2), Some(1)], ); - let min = unsafe { a.agg_min(&groups) }; - let expected = Column::new("".into(), [3, 3, 3, 3, 2, 1]); + let min = unsafe { a.as_materialized_series().agg_min(&groups) }; + let expected = Series::new("".into(), [3, 3, 3, 3, 2, 1]); assert_eq!(min, expected); // Expected for nulls is equality. let min = unsafe { nulls.agg_min(&groups) }; assert_eq!(min, expected); - let max = unsafe { a.agg_max(&groups) }; - let expected = Column::new("".into(), [3, 7, 7, 9, 9, 1]); + let max = unsafe { a.as_materialized_series().agg_max(&groups) }; + let expected = Series::new("".into(), [3, 7, 7, 9, 9, 1]); assert_eq!(max, expected); let max = unsafe { nulls.agg_max(&groups) }; assert_eq!(max, expected); - let var = unsafe { a.agg_var(&groups, 1) }; - let expected = Column::new( + let var = unsafe { a.as_materialized_series().agg_var(&groups, 1) }; + let expected = Series::new( "".into(), [0.0, 8.0, 4.000000000000002, 6.666666666666667, 24.5, 0.0], ); - assert!(abs(&(var - expected)?.as_materialized_series()).unwrap().lt(1e-12).unwrap().all()); + assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); let var = unsafe { nulls.agg_var(&groups, 1) }; - let expected = Column::new("".into(), [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); - assert!(abs(&(var - expected)?.as_materialized_series()).unwrap().lt(1e-12).unwrap().all()); + let expected = Series::new("".into(), [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); + assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); - let quantile = unsafe { a.agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; - let expected = Column::new("".into(), [3.0, 5.0, 5.0, 
6.0, 5.5, 1.0]); + let quantile = unsafe { a.as_materialized_series().agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; + let expected = Series::new("".into(), [3.0, 5.0, 5.0, 6.0, 5.5, 1.0]); assert_eq!(quantile, expected); let quantile = unsafe { nulls.agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; - let expected = Column::new("".into(), [3.0, 5.0, 5.0, 7.0, 5.5, 1.0]); + let expected = Series::new("".into(), [3.0, 5.0, 5.0, 7.0, 5.5, 1.0]); assert_eq!(quantile, expected); Ok(()) diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 95c759f836e7..1285f2e2296a 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -251,11 +251,11 @@ //! # fn example() -> PolarsResult<()> { //! //! // apply a closure over all values -//! let s = Series::new("foo".into(), &[Some(1), Some(2), None]); +//! let s = Column::new("foo".into(), &[Some(1), Some(2), None]); //! s.i32()?.apply_values(|value| value * 20); //! //! // count string lengths -//! let s = Series::new("foo".into(), &["foo", "bar", "foobar"]); +//! let s = Column::new("foo".into(), &["foo", "bar", "foobar"]); //! unary_elementwise_values(s.str()?, |str_val| str_val.len() as u64); //! //! # Ok(()) @@ -474,7 +474,10 @@ //! "D" => &[2, 4, 6] //! ]?; //! -//! let unpivoted = df.unpivot(&["A", "B"], &["C", "D"]).unwrap(); +//! let unpivoted = df.unpivot( +//! &[PlSmallStr::from_static("A"), PlSmallStr::from_static("B")], +//! &[PlSmallStr::from_static("C"), PlSmallStr::from_static("D")], +//! ).unwrap(); //! // unpivoted: //! //! // +-----+-----+----------+-------+ @@ -510,14 +513,14 @@ //! let s1 = Series::new("b".into(), &[1i64, 1, 1]); //! let s2 = Series::new("c".into(), &[2i64, 2, 2]); //! // construct a new ListChunked for a slice of Series. -//! let list = Series::new("foo", &[s0, s1, s2]); +//! let list = Column::new("foo".into(), &[s0, s1, s2]); //! //! // construct a few more Series. -//! 
let s0 = Series::new("B".into(), [1, 2, 3]); -//! let s1 = Series::new("C".into(), [1, 1, 1]); +//! let s0 = Column::new("B".into(), [1, 2, 3]); +//! let s1 = Column::new("C".into(), [1, 1, 1]); //! let df = DataFrame::new(vec![list, s0, s1])?; //! -//! let exploded = df.explode(["foo"])?; +//! let exploded = df.explode([PlSmallStr::from("foo")])?; //! // exploded: //! //! // +-----+-----+-----+ diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index c91367490130..82a093e6c3ab 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -81,11 +81,8 @@ //! ]?; //! // sort this DataFrame by multiple columns //! -//! // ordering of the columns -//! let descending = vec![true, false]; -//! //! let sorted = df.lazy() -//! .sort_by_exprs(vec![col("b"), col("a")], descending, false, false) +//! .sort_by_exprs(vec![col("b"), col("a")]) //! .collect()?; //! //! // sorted: diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 9910df124fa5..5ecc28c94c34 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -20,7 +20,7 @@ //! .agg([ //! // expressions can be combined into powerful aggregations //! col("foo") -//! .sort_by([col("ham").rank(Default::default(), None)], [false]) +//! .sort_by([col("ham").rank(Default::default(), None)], SortMultipleOptions::default()) //! .last() //! .alias("last_foo_ranked_by_ham"), //! 
// every expression runs in parallel diff --git a/crates/polars/tests/it/core/date_like.rs b/crates/polars/tests/it/core/date_like.rs index 7777d3fd1eb0..0d08c6079539 100644 --- a/crates/polars/tests/it/core/date_like.rs +++ b/crates/polars/tests/it/core/date_like.rs @@ -4,7 +4,7 @@ use super::*; #[cfg(feature = "dtype-datetime")] #[cfg_attr(miri, ignore)] fn test_datelike_join() -> PolarsResult<()> { - let s = Series::new("foo".into(), &[1, 2, 3]); + let s = Column::new("foo".into(), &[1, 2, 3]); let mut s1 = s.cast(&DataType::Datetime(TimeUnit::Nanoseconds, None))?; s1.rename("bar".into()); diff --git a/crates/polars/tests/it/core/joins.rs b/crates/polars/tests/it/core/joins.rs index fe4ec8ba78cb..9388a94d5960 100644 --- a/crates/polars/tests/it/core/joins.rs +++ b/crates/polars/tests/it/core/joins.rs @@ -39,13 +39,13 @@ fn test_chunked_left_join() -> PolarsResult<()> { } fn create_frames() -> (DataFrame, DataFrame) { - let s0 = Series::new("days".into(), &[0, 1, 2]); - let s1 = Series::new("temp".into(), &[22.1, 19.9, 7.]); - let s2 = Series::new("rain".into(), &[0.2, 0.1, 0.3]); + let s0 = Column::new("days".into(), &[0, 1, 2]); + let s1 = Column::new("temp".into(), &[22.1, 19.9, 7.]); + let s2 = Column::new("rain".into(), &[0.2, 0.1, 0.3]); let temp = DataFrame::new(vec![s0, s1, s2]).unwrap(); - let s0 = Series::new("days".into(), &[1, 2, 3, 1]); - let s1 = Series::new("rain".into(), &[0.1, 0.2, 0.3, 0.4]); + let s0 = Column::new("days".into(), &[1, 2, 3, 1]); + let s1 = Column::new("rain".into(), &[0.1, 0.2, 0.3, 0.4]); let rain = DataFrame::new(vec![s0, s1]).unwrap(); (temp, rain) } @@ -59,10 +59,10 @@ fn test_inner_join() { std::env::set_var("POLARS_MAX_THREADS", format!("{}", i)); let joined = temp.inner_join(&rain, ["days"], ["days"]).unwrap(); - let join_col_days = Series::new("days".into(), &[1, 2, 1]); - let join_col_temp = Series::new("temp".into(), &[19.9, 7., 19.9]); - let join_col_rain = Series::new("rain".into(), &[0.1, 0.3, 0.1]); - let 
join_col_rain_right = Series::new("rain_right".into(), [0.1, 0.2, 0.4].as_ref()); + let join_col_days = Column::new("days".into(), &[1, 2, 1]); + let join_col_temp = Column::new("temp".into(), &[19.9, 7., 19.9]); + let join_col_rain = Column::new("rain".into(), &[0.1, 0.3, 0.1]); + let join_col_rain_right = Column::new("rain_right".into(), [0.1, 0.2, 0.4].as_ref()); let true_df = DataFrame::new(vec![ join_col_days, join_col_temp, @@ -81,31 +81,45 @@ fn test_inner_join() { fn test_left_join() { for i in 1..8 { std::env::set_var("POLARS_MAX_THREADS", format!("{}", i)); - let s0 = Series::new("days".into(), &[0, 1, 2, 3, 4]); - let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); + let s0 = Column::new("days".into(), &[0, 1, 2, 3, 4]); + let s1 = Column::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); let temp = DataFrame::new(vec![s0, s1]).unwrap(); - let s0 = Series::new("days".into(), &[1, 2]); - let s1 = Series::new("rain".into(), &[0.1, 0.2]); + let s0 = Column::new("days".into(), &[1, 2]); + let s1 = Column::new("rain".into(), &[0.1, 0.2]); let rain = DataFrame::new(vec![s0, s1]).unwrap(); let joined = temp.left_join(&rain, ["days"], ["days"]).unwrap(); assert_eq!( - (joined.column("rain").unwrap().sum::().unwrap() * 10.).round(), + (joined + .column("rain") + .unwrap() + .as_materialized_series() + .sum::() + .unwrap() + * 10.) + .round(), 3. 
); assert_eq!(joined.column("rain").unwrap().null_count(), 3); // test join on string - let s0 = Series::new("days".into(), &["mo", "tue", "wed", "thu", "fri"]); - let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); + let s0 = Column::new("days".into(), &["mo", "tue", "wed", "thu", "fri"]); + let s1 = Column::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); let temp = DataFrame::new(vec![s0, s1]).unwrap(); - let s0 = Series::new("days".into(), &["tue", "wed"]); - let s1 = Series::new("rain".into(), &[0.1, 0.2]); + let s0 = Column::new("days".into(), &["tue", "wed"]); + let s1 = Column::new("rain".into(), &[0.1, 0.2]); let rain = DataFrame::new(vec![s0, s1]).unwrap(); let joined = temp.left_join(&rain, ["days"], ["days"]).unwrap(); assert_eq!( - (joined.column("rain").unwrap().sum::().unwrap() * 10.).round(), + (joined + .column("rain") + .unwrap() + .as_materialized_series() + .sum::() + .unwrap() + * 10.) + .round(), 3. ); assert_eq!(joined.column("rain").unwrap().null_count(), 3); @@ -123,7 +137,7 @@ fn test_full_outer_join() -> PolarsResult<()> { JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), )?; assert_eq!(joined.height(), 5); - assert_eq!(joined.column("days")?.sum::().unwrap(), 7); + assert_eq!(joined.column("days")?.as_materialized_series().sum::().unwrap(), 7); let df_left = df!( "a"=> ["a", "b", "a", "z"], @@ -153,15 +167,15 @@ fn test_join_with_nulls() { let dts = &[20, 21, 22, 23, 24, 25, 27, 28]; let vals = &[1.2, 2.4, 4.67, 5.8, 4.4, 3.6, 7.6, 6.5]; let df = DataFrame::new(vec![ - Series::new("date".into(), dts), - Series::new("val".into(), vals), + Column::new("date".into(), dts), + Column::new("val".into(), vals), ]) .unwrap(); let vals2 = &[Some(1.1), None, Some(3.3), None, None]; let df2 = DataFrame::new(vec![ - Series::new("date".into(), &dts[3..]), - Series::new("val2".into(), vals2), + Column::new("date".into(), &dts[3..]), + Column::new("val2".into(), vals2), ]) .unwrap(); @@ -338,14 +352,14 @@ fn 
test_join_categorical() { fn test_empty_df_join() -> PolarsResult<()> { let empty: Vec = vec![]; let empty_df = DataFrame::new(vec![ - Series::new("key".into(), &empty), - Series::new("eval".into(), &empty), + Column::new("key".into(), &empty), + Column::new("eval".into(), &empty), ]) .unwrap(); let df = DataFrame::new(vec![ - Series::new("key".into(), &["foo"]), - Series::new("aval".into(), &[4]), + Column::new("key".into(), &["foo"]), + Column::new("aval".into(), &[4]), ]) .unwrap(); @@ -361,8 +375,8 @@ fn test_empty_df_join() -> PolarsResult<()> { let empty: Vec = vec![]; let _empty_df = DataFrame::new(vec![ - Series::new("key".into(), &empty), - Series::new("eval".into(), &empty), + Column::new("key".into(), &empty), + Column::new("eval".into(), &empty), ]) .unwrap(); @@ -374,9 +388,9 @@ fn test_empty_df_join() -> PolarsResult<()> { // https://github.com/pola-rs/polars/issues/1824 let empty: Vec = vec![]; let empty_df = DataFrame::new(vec![ - Series::new("key".into(), &empty), - Series::new("1val".into(), &empty), - Series::new("2val".into(), &empty), + Column::new("key".into(), &empty), + Column::new("1val".into(), &empty), + Column::new("2val".into(), &empty), ])?; let out = df.left_join(&empty_df, ["key"], ["key"])?; @@ -610,7 +624,7 @@ fn test_4_threads_bit_offset() -> PolarsResult<()> { .collect::(); left_a.rename("a".into()); left_b.rename("b".into()); - let left_df = DataFrame::new(vec![left_a.into_series(), left_b.into_series()])?; + let left_df = DataFrame::new(vec![left_a.into_column(), left_b.into_column()])?; let i = 1; let len = 8; @@ -622,7 +636,7 @@ fn test_4_threads_bit_offset() -> PolarsResult<()> { right_a.rename("a".into()); right_b.rename("b".into()); - let right_df = DataFrame::new(vec![right_a.into_series(), right_b.into_series()])?; + let right_df = DataFrame::new(vec![right_a.into_column(), right_b.into_column()])?; let out = JoinBuilder::new(left_df.lazy()) .with(right_df.lazy()) .on([col("a"), col("b")]) diff --git 
a/crates/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs index 85cf69ec1494..51367d2b7e42 100644 --- a/crates/polars/tests/it/core/pivot.rs +++ b/crates/polars/tests/it/core/pivot.rs @@ -56,9 +56,9 @@ fn test_pivot_date_() -> PolarsResult<()> { #[test] fn test_pivot_old() { - let s0 = Series::new("index".into(), ["A", "A", "B", "B", "C"].as_ref()); - let s2 = Series::new("columns".into(), ["k", "l", "m", "m", "l"].as_ref()); - let s1 = Series::new("values".into(), [1, 2, 2, 4, 2].as_ref()); + let s0 = Column::new("index".into(), ["A", "A", "B", "B", "C"].as_ref()); + let s2 = Column::new("columns".into(), ["k", "l", "m", "m", "l"].as_ref()); + let s1 = Column::new("values".into(), [1, 2, 2, 4, 2].as_ref()); let df = DataFrame::new(vec![s0, s1, s2]).unwrap(); let pvt = pivot( diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 7c08998e69af..b1285d5710b6 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -44,15 +44,15 @@ fn write_csv() { fn write_dates() { use polars_core::export::chrono; - let s0 = Series::new( + let s0 = Column::new( "date".into(), [chrono::NaiveDate::from_yo_opt(2024, 33), None], ); - let s1 = Series::new( + let s1 = Column::new( "time".into(), [None, chrono::NaiveTime::from_hms_opt(19, 50, 0)], ); - let s2 = Series::new( + let s2 = Column::new( "datetime".into(), [ Some(chrono::NaiveDateTime::new( @@ -122,7 +122,7 @@ fn write_dates() { NonExistent::Raise, ) .unwrap() - .into_series(); + .into_column(); let mut with_timezone_df = DataFrame::new(vec![with_timezone]).unwrap(); buf.clear(); CsvWriter::new(&mut buf) @@ -150,7 +150,7 @@ fn test_read_csv_filter() -> PolarsResult<()> { .try_into_reader_with_file_path(Some(FOODS_CSV.into()))? .finish()?; - let out = df.filter(&df.column("fats_g")?.gt(4)?)?; + let out = df.filter(&df.column("fats_g")?.as_materialized_series().gt(4)?)?; // This fails if all columns are not equal. 
println!("{out}"); @@ -221,7 +221,7 @@ fn test_parser() -> PolarsResult<()> { assert_eq!(col.get(2)?, AnyValue::String("Setosa")); assert_eq!("sepal_length", df.get_columns()[0].name().as_str()); - assert_eq!(1, df.column("sepal_length").unwrap().chunks().len()); + assert_eq!(1, df.column("sepal_length").unwrap().as_materialized_series().chunks().len()); assert_eq!(df.height(), 7); // test windows line endings @@ -309,15 +309,15 @@ fn test_missing_data() { assert!(df .column("column_1") .unwrap() - .equals(&Series::new("column_1".into(), &[1_i64, 1]))); + .equals(&Column::new("column_1".into(), &[1_i64, 1]))); assert!(df .column("column_2") .unwrap() - .equals_missing(&Series::new("column_2".into(), &[Some(2_i64), None]))); + .equals_missing(&Column::new("column_2".into(), &[Some(2_i64), None]))); assert!(df .column("column_3") .unwrap() - .equals(&Series::new("column_3".into(), &[3_i64, 3]))); + .equals(&Column::new("column_3".into(), &[3_i64, 3]))); } #[test] @@ -332,7 +332,7 @@ fn test_escape_comma() { assert!(df .column("column_3") .unwrap() - .equals(&Series::new("column_3".into(), &[11_i64, 12]))); + .equals(&Column::new("column_3".into(), &[11_i64, 12]))); } #[test] @@ -344,7 +344,7 @@ fn test_escape_double_quotes() { let file = Cursor::new(csv); let df = CsvReader::new(file).finish().unwrap(); assert_eq!(df.shape(), (2, 3)); - assert!(df.column("column_2").unwrap().equals(&Series::new( + assert!(df.column("column_2").unwrap().equals(&Column::new( "column_2".into(), &[ r#"with "double quotes" US"#, @@ -403,7 +403,7 @@ hello,","," ",world,"!" 
assert!(df .column(col) .unwrap() - .equals(&Series::new(col.into(), &[val; 4]))); + .equals(&Column::new(col.into(), &[val; 4]))); } } @@ -425,7 +425,7 @@ versions of Lorem Ipsum.",11 .finish() .unwrap(); - assert!(df.column("column_2").unwrap().equals(&Series::new( + assert!(df.column("column_2").unwrap().equals(&Column::new( "column_2".into(), &[ r#"Lorem Ipsum is simply dummy text of the printing and typesetting diff --git a/crates/polars/tests/it/io/ipc.rs b/crates/polars/tests/it/io/ipc.rs index 8a5602c86051..959886e33b72 100644 --- a/crates/polars/tests/it/io/ipc.rs +++ b/crates/polars/tests/it/io/ipc.rs @@ -24,8 +24,8 @@ fn test_ipc_compression_variadic_buffers() { #[cfg(test)] pub(crate) fn create_df() -> DataFrame { - let s0 = Series::new("days".into(), [0, 1, 2, 3, 4].as_ref()); - let s1 = Series::new("temp".into(), [22.1, 19.9, 7., 2., 3.].as_ref()); + let s0 = Column::new("days".into(), [0, 1, 2, 3, 4].as_ref()); + let s1 = Column::new("temp".into(), [22.1, 19.9, 7., 2., 3.].as_ref()); DataFrame::new(vec![s0, s1]).unwrap() } @@ -141,7 +141,7 @@ fn test_write_with_compression() { fn write_and_read_ipc_empty_series() { let mut buf: Cursor> = Cursor::new(Vec::new()); let chunked_array = Float64Chunked::new("empty".into(), &[0_f64; 0]); - let mut df = DataFrame::new(vec![chunked_array.into_series()]).unwrap(); + let mut df = DataFrame::new(vec![chunked_array.into_column()]).unwrap(); IpcWriter::new(&mut buf) .finish(&mut df) .expect("ipc writer"); diff --git a/crates/polars/tests/it/io/ipc_stream.rs b/crates/polars/tests/it/io/ipc_stream.rs index d12082d0dd71..770e0d88faec 100644 --- a/crates/polars/tests/it/io/ipc_stream.rs +++ b/crates/polars/tests/it/io/ipc_stream.rs @@ -146,7 +146,7 @@ mod test { fn write_and_read_ipc_stream_empty_series() { fn df() -> DataFrame { DataFrame::new(vec![ - Float64Chunked::new("empty".into(), &[0_f64; 0]).into_series() + Float64Chunked::new("empty".into(), &[0_f64; 0]).into_column() ]) .unwrap() } diff --git 
a/crates/polars/tests/it/io/mod.rs b/crates/polars/tests/it/io/mod.rs index 2fd9aab899d1..6ea615799996 100644 --- a/crates/polars/tests/it/io/mod.rs +++ b/crates/polars/tests/it/io/mod.rs @@ -17,7 +17,7 @@ mod ipc_stream; use polars::prelude::*; pub(crate) fn create_df() -> DataFrame { - let s0 = Series::new("days".into(), [0, 1, 2, 3, 4].as_ref()); - let s1 = Series::new("temp".into(), [22.1, 19.9, 7., 2., 3.].as_ref()); + let s0 = Column::new("days".into(), [0, 1, 2, 3, 4].as_ref()); + let s1 = Column::new("temp".into(), [22.1, 19.9, 7., 2., 3.].as_ref()); DataFrame::new(vec![s0, s1]).unwrap() } diff --git a/crates/polars/tests/it/joins.rs b/crates/polars/tests/it/joins.rs index 0fa0ba1c66a9..19e4911df3a9 100644 --- a/crates/polars/tests/it/joins.rs +++ b/crates/polars/tests/it/joins.rs @@ -36,14 +36,14 @@ fn join_nans_outer() -> PolarsResult<()> { #[test] #[cfg(feature = "lazy")] fn join_empty_datasets() -> PolarsResult<()> { - let a = DataFrame::new(Vec::from([Series::new_empty( + let a = DataFrame::new(Vec::from([Column::new_empty( "foo".into(), &DataType::Int64, )])) .unwrap(); let b = DataFrame::new(Vec::from([ - Series::new_empty("foo".into(), &DataType::Int64), - Series::new_empty("bar".into(), &DataType::Int64), + Column::new_empty("foo".into(), &DataType::Int64), + Column::new_empty("bar".into(), &DataType::Int64), ])) .unwrap(); diff --git a/crates/polars/tests/it/lazy/aggregation.rs b/crates/polars/tests/it/lazy/aggregation.rs index ad433e139775..85ded9c742d0 100644 --- a/crates/polars/tests/it/lazy/aggregation.rs +++ b/crates/polars/tests/it/lazy/aggregation.rs @@ -14,9 +14,9 @@ fn test_lazy_agg() { ], "%Y-%m-%d", ) - .into_series(); - let s1 = Series::new("temp".into(), [20, 10, 7, 9, 1].as_ref()); - let s2 = Series::new("rain".into(), [0.2, 0.1, 0.3, 0.1, 0.01].as_ref()); + .into_column(); + let s1 = Column::new("temp".into(), [20, 10, 7, 9, 1].as_ref()); + let s2 = Column::new("rain".into(), [0.2, 0.1, 0.3, 0.1, 0.01].as_ref()); let df = 
DataFrame::new(vec![s0, s1, s2]).unwrap(); let lf = df @@ -33,7 +33,7 @@ fn test_lazy_agg() { let new = lf.collect().unwrap(); let min = new.column("min").unwrap(); - assert_eq!(min, &Series::new("min".into(), [0.1f64, 0.01, 0.1])); + assert_eq!(min, &Column::new("min".into(), [0.1f64, 0.01, 0.1])); } #[test] diff --git a/crates/polars/tests/it/lazy/cwc.rs b/crates/polars/tests/it/lazy/cwc.rs index 2ad0ab11ede4..5be002410391 100644 --- a/crates/polars/tests/it/lazy/cwc.rs +++ b/crates/polars/tests/it/lazy/cwc.rs @@ -59,7 +59,7 @@ fn fuzz_cluster_with_columns() { let mut unused_cols: Vec = Vec::with_capacity(26); let mut used_cols: Vec = Vec::with_capacity(26); - let mut series: Vec = Vec::with_capacity(*NUM_ORIGINAL_COLS.end()); + let mut columns: Vec = Vec::with_capacity(*NUM_ORIGINAL_COLS.end()); let mut used: Vec = Vec::with_capacity(26); @@ -76,11 +76,11 @@ fn fuzz_cluster_with_columns() { let column = rng.gen_range(0..unused_cols.len()); let column = unused_cols.swap_remove(column); - series.push(Series::new(to_str!(column).into(), vec![rnd_prime(rng)])); + columns.push(Column::new(to_str!(column).into(), vec![rnd_prime(rng)])); used_cols.push(column); } - let mut lf = DataFrame::new(std::mem::take(&mut series)).unwrap().lazy(); + let mut lf = DataFrame::new(std::mem::take(&mut columns)).unwrap().lazy(); for _ in 0..num_with_columns { let num_exprs = rng.gen_range(0..8); diff --git a/crates/polars/tests/it/lazy/expressions/arity.rs b/crates/polars/tests/it/lazy/expressions/arity.rs index 52ac97c56e62..56bfa432f8ce 100644 --- a/crates/polars/tests/it/lazy/expressions/arity.rs +++ b/crates/polars/tests/it/lazy/expressions/arity.rs @@ -58,7 +58,7 @@ fn includes_null_predicate_3038() -> PolarsResult<()> { s.str()? .to_lowercase() .contains("not_exist", true) - .map(|ca| Some(ca.into_series())) + .map(|ca| Some(ca.into_column())) }, GetOutput::from_type(DataType::Boolean), )) @@ -88,7 +88,7 @@ fn includes_null_predicate_3038() -> PolarsResult<()> { s.str()? 
.to_lowercase() .contains_literal("non-existent") - .map(|ca| Some(ca.into_series())) + .map(|ca| Some(ca.into_column())) }, GetOutput::from_type(DataType::Boolean), )) diff --git a/crates/polars/tests/it/lazy/expressions/window.rs b/crates/polars/tests/it/lazy/expressions/window.rs index d617dd46574a..21d8a3d26bf7 100644 --- a/crates/polars/tests/it/lazy/expressions/window.rs +++ b/crates/polars/tests/it/lazy/expressions/window.rs @@ -217,7 +217,7 @@ fn test_window_mapping() -> PolarsResult<()> { .select([(lit(10) + col("A")).alias("foo").over([col("fruits")])]) .collect()?; - let expected = Series::new("foo".into(), [11, 12, 13, 14, 15]); + let expected = Column::new("foo".into(), [11, 12, 13, 14, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -232,7 +232,7 @@ fn test_window_mapping() -> PolarsResult<()> { .over([col("fruits")]), ]) .collect()?; - let expected = Series::new("foo".into(), [11, 12, 8, 9, 15]); + let expected = Column::new("foo".into(), [11, 12, 8, 9, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -247,7 +247,7 @@ fn test_window_mapping() -> PolarsResult<()> { .over([col("fruits")]), ]) .collect()?; - let expected = Series::new("foo".into(), [None, Some(3), None, Some(-1), Some(-1)]); + let expected = Column::new("foo".into(), [None, Some(3), None, Some(-1), Some(-1)]); assert!(out.column("foo")?.equals_missing(&expected)); // now sorted @@ -259,7 +259,7 @@ fn test_window_mapping() -> PolarsResult<()> { .lazy() .select([(lit(10) + col("A")).alias("foo").over([col("fruits")])]) .collect()?; - let expected = Series::new("foo".into(), [13, 14, 11, 12, 15]); + let expected = Column::new("foo".into(), [13, 14, 11, 12, 15]); assert!(out.column("foo")?.equals(&expected)); let out = df @@ -275,7 +275,7 @@ fn test_window_mapping() -> PolarsResult<()> { ]) .collect()?; - let expected = Series::new("foo".into(), [8, 9, 11, 12, 15]); + let expected = Column::new("foo".into(), [8, 9, 11, 12, 15]); 
assert!(out.column("foo")?.equals(&expected)); let out = df @@ -290,7 +290,7 @@ fn test_window_mapping() -> PolarsResult<()> { ]) .collect()?; - let expected = Series::new("foo".into(), [None, Some(-1), None, Some(3), Some(-1)]); + let expected = Column::new("foo".into(), [None, Some(-1), None, Some(3), Some(-1)]); assert!(out.column("foo")?.equals_missing(&expected)); Ok(()) @@ -381,7 +381,7 @@ fn test_window_naive_any() -> PolarsResult<()> { .collect()?; let res = df.column("res")?; - assert_eq!(res.sum::().unwrap(), 5); + assert_eq!(res.as_materialized_series().sum::().unwrap(), 5); Ok(()) } diff --git a/crates/polars/tests/it/lazy/exprs.rs b/crates/polars/tests/it/lazy/exprs.rs index 45d550ae85a1..84dfb7ade3cf 100644 --- a/crates/polars/tests/it/lazy/exprs.rs +++ b/crates/polars/tests/it/lazy/exprs.rs @@ -7,9 +7,9 @@ fn fuzz_exprs() { use rand::Rng; let lf = DataFrame::new(vec![ - Series::new("A".into(), vec![1, 2, 3, 4, 5]), - Series::new("B".into(), vec![Some(5), Some(4), None, Some(2), Some(1)]), - Series::new( + Column::new("A".into(), vec![1, 2, 3, 4, 5]), + Column::new("B".into(), vec![Some(5), Some(4), None, Some(2), Some(1)]), + Column::new( "C".into(), vec!["str", "", "a quite long string", "my", "string"], ), @@ -17,9 +17,9 @@ fn fuzz_exprs() { .unwrap() .lazy(); let empty = DataFrame::new(vec![ - Series::new("A".into(), Vec::::new()), - Series::new("B".into(), Vec::::new()), - Series::new("C".into(), Vec::<&str>::new()), + Column::new("A".into(), Vec::::new()), + Column::new("B".into(), Vec::::new()), + Column::new("C".into(), Vec::<&str>::new()), ]) .unwrap() .lazy(); diff --git a/crates/polars/tests/it/lazy/group_by.rs b/crates/polars/tests/it/lazy/group_by.rs index ac76e4921e40..cbae14aca5f0 100644 --- a/crates/polars/tests/it/lazy/group_by.rs +++ b/crates/polars/tests/it/lazy/group_by.rs @@ -79,7 +79,7 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { let out = out.column("diff")?; assert_eq!( out, - &Series::new("diff".into(), &[None, 
Some(26), Some(6), None]) + &Column::new("diff".into(), &[None, Some(26), Some(6), None]) ); Ok(()) @@ -123,7 +123,7 @@ fn test_group_by_agg_list_with_not_aggregated() -> PolarsResult<()> { let out = out.explode()?; assert_eq!( out, - Series::new("value".into(), &[0, 2, 1, 3, 2, 2, 7, 2, 3, 1, 2, 1]) + Column::new("value".into(), &[0, 2, 1, 3, 2, 2, 7, 2, 3, 1, 2, 1]) ); Ok(()) } diff --git a/crates/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs index 0be10b20f60e..f140a0461639 100644 --- a/crates/polars/tests/it/lazy/queries.rs +++ b/crates/polars/tests/it/lazy/queries.rs @@ -7,7 +7,7 @@ fn test_with_duplicate_column_empty_df() { let a = Int32Chunked::from_slice("a".into(), &[]); assert_eq!( - DataFrame::new(vec![a.into_series()]) + DataFrame::new(vec![a.into_column()]) .unwrap() .lazy() .with_columns([lit(true).alias("a")]) @@ -195,7 +195,7 @@ fn test_unknown_supertype_ignore() -> PolarsResult<()> { fn test_apply_multiple_columns() -> PolarsResult<()> { let df = fruits_cars(); - let multiply = |s: &mut [Series]| (&(&s[0] * &s[0])? * &s[1]).map(Some); + let multiply = |s: &mut [Column]| (&(&s[0] * &s[0])? 
* &s[1]).map(Some); let out = df .clone() @@ -234,14 +234,14 @@ fn test_apply_multiple_columns() -> PolarsResult<()> { #[test] fn test_group_by_on_lists() -> PolarsResult<()> { - let s0 = Series::new("".into(), [1i32, 2, 3]); - let s1 = Series::new("groups".into(), [4i32, 5]); + let s0 = Column::new("".into(), [1i32, 2, 3]); + let s1 = Column::new("groups".into(), [4i32, 5]); let mut builder = ListPrimitiveChunkedBuilder::::new("arrays".into(), 10, 10, DataType::Int32); - builder.append_series(&s0).unwrap(); - builder.append_series(&s1).unwrap(); - let s2 = builder.finish().into_series(); + builder.append_series(s0.as_materialized_series()).unwrap(); + builder.append_series(s1.as_materialized_series()).unwrap(); + let s2 = builder.finish().into_column(); let df = DataFrame::new(vec![s1, s2])?; let out = df From 52405af19ea78924c9b25b740bc79c5c7f1dc877 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 16:22:59 +0200 Subject: [PATCH 12/42] fix failed rebase --- .../polars-mem-engine/src/executors/stack.rs | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index fa8240d0a4cc..2b59fbc2535d 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -64,7 +64,7 @@ impl StackExec { // new, unique column names. It is immediately // followed by a projection which pulls out the // possibly mismatching column lengths. 
- unsafe { df.get_columns_mut().extend(res) }; + unsafe { df.get_columns_mut() } .extend(res.into_iter().map(Column::from)); } else { let height = df.height(); @@ -85,24 +85,8 @@ impl StackExec { c.name(), len, height ); } -======= - if !self.options.should_broadcast { - debug_assert!( - res.iter() - .all(|column| column.name().starts_with("__POLARS_CSER_0x")), - "non-broadcasting hstack should only be used for CSE columns" - ); - // Safety: this case only appears as a result of - // CSE optimization, and the usage there produces - // new, unique column names. It is immediately - // followed by a projection which pulls out the - // possibly mismatching column lengths. - unsafe { df.get_columns_mut().extend(res.into_iter().map(Column::from)) }; - } else { - df._add_series(res, schema)?; ->>>>>>> e774e00d2f (finish polars-pipe) } - df._add_columns(res, schema)?; + df._add_series(res, schema)?; } df }; From 5faaaa9cf513593933d7fadb6747b524f00c7612 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 16:52:44 +0200 Subject: [PATCH 13/42] clippy and format --- crates/polars-core/src/frame/column.rs | 64 +++++++++++++++++-- crates/polars-core/src/frame/group_by/mod.rs | 28 +++++--- crates/polars-core/src/frame/mod.rs | 8 +-- crates/polars-core/src/prelude.rs | 2 +- crates/polars-core/src/scalar/mod.rs | 2 +- crates/polars-core/src/serde/df.rs | 2 +- crates/polars-core/src/series/mod.rs | 1 + crates/polars-core/src/series/series_trait.rs | 16 +++++ crates/polars-io/src/csv/read/read_impl.rs | 2 +- .../polars-io/src/parquet/read/read_impl.rs | 9 ++- crates/polars-lazy/src/tests/arity.rs | 9 ++- crates/polars-lazy/src/tests/queries.rs | 20 +++++- .../polars-mem-engine/src/executors/stack.rs | 2 +- crates/polars-ops/src/chunked_array/top_k.rs | 7 +- crates/polars-ops/src/frame/join/mod.rs | 4 +- .../polars-ops/src/frame/pivot/positioning.rs | 4 -- crates/polars-ops/src/series/ops/cut.rs | 22 +++---- crates/polars-ops/src/series/ops/fused.rs | 3 +- 
crates/polars-ops/src/series/ops/replace.rs | 7 +- crates/polars-ops/src/series/ops/rle.rs | 10 ++- .../src/executors/operators/projection.rs | 7 +- .../src/executors/sinks/group_by/string.rs | 6 +- .../src/dsl/function_expr/array.rs | 12 ++-- .../polars-plan/src/dsl/function_expr/clip.rs | 14 ++-- .../src/dsl/function_expr/correlation.rs | 34 ++++++---- .../polars-plan/src/dsl/function_expr/cut.rs | 37 +++++++++++ .../src/dsl/function_expr/datetime.rs | 5 +- .../src/dsl/function_expr/dispatch.rs | 25 ++++---- .../polars-plan/src/dsl/function_expr/list.rs | 13 ++-- .../polars-plan/src/dsl/function_expr/mod.rs | 5 +- .../src/dsl/function_expr/rolling_by.rs | 14 ++-- .../src/dsl/function_expr/shrink_type.rs | 3 +- .../src/dsl/function_expr/strings.rs | 6 +- .../src/dsl/function_expr/trigonometry.rs | 4 +- crates/polars-plan/src/dsl/mod.rs | 2 +- crates/polars-plan/src/dsl/python_udf.rs | 2 +- crates/polars-plan/src/dsl/udf.rs | 2 +- .../src/interop/numpy/to_numpy_df.rs | 13 +++- crates/polars-time/src/group_by/dynamic.rs | 5 +- crates/polars/tests/it/core/joins.rs | 9 ++- crates/polars/tests/it/io/csv.rs | 9 ++- docs/src/rust/user-guide/expressions/lists.rs | 4 +- 42 files changed, 321 insertions(+), 132 deletions(-) create mode 100644 crates/polars-plan/src/dsl/function_expr/cut.rs diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 71344b3e9005..ecad8a22aab6 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -45,7 +45,7 @@ impl Column { #[inline] pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { // @scalar-opt - Self::Series(Series::new_empty(name, &dtype)) + Self::Series(Series::new_empty(name, dtype)) } #[inline] @@ -266,41 +266,69 @@ impl Column { self.as_materialized_series().null_count() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. 
+ #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_min(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_max(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_mean(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_sum(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_first(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_last(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. pub unsafe fn agg_quantile( &self, groups: &GroupsProxy, @@ -315,21 +343,37 @@ impl Column { .into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. 
+ #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_median(groups) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() } - pub unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub(crate) unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { // @scalar-opt unsafe { self.as_materialized_series().agg_list(groups) }.into() @@ -398,6 +442,9 @@ impl Column { .vec_hash_combine(build_hasher, hashes) } + /// # Safety + /// + /// Indexes need to be in bounds. pub(crate) unsafe fn equal_element( &self, idx_self: usize, @@ -524,6 +571,9 @@ impl Column { .map(Column::from) } + /// # Safety + /// + /// This can lead to invalid memory access in downstream code. 
pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { // @scalar-opt unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) @@ -709,6 +759,9 @@ impl Column { self.as_materialized_series().phys_iter() } + /// # Safety + /// + /// Does not perform bounds check on `index` pub unsafe fn get_unchecked(&self, index: usize) -> AnyValue { // @scalar-opt self.as_materialized_series().get_unchecked(index) @@ -1008,10 +1061,9 @@ impl From for _SerdeSeries { } } -impl Into for _SerdeSeries { +impl From<_SerdeSeries> for Series { #[inline] - fn into(self) -> Series { - self.0 + fn from(value: _SerdeSeries) -> Self { + value.0 } } - diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index 322c6b967e33..e2fbb90d6e74 100644 --- a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -55,7 +55,9 @@ impl DataFrame { let groups = if by.len() == 1 { let column = &by[0]; - column.as_materialized_series().group_tuples(multithreaded, sorted) + column + .as_materialized_series() + .group_tuples(multithreaded, sorted) } else if by.iter().any(|s| s.dtype().is_object()) { #[cfg(feature = "object")] { @@ -294,7 +296,7 @@ impl<'df> GroupBy<'df> { }, } }) - .map(|s| Column::from(s)) + .map(Column::from) .collect() }) } @@ -396,7 +398,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum); let mut agg = unsafe { agg_col.agg_sum(&self.groups) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -433,7 +435,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min); let mut agg = unsafe { agg_col.agg_min(&self.groups) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -470,7 +472,7 @@ impl<'df> GroupBy<'df> { let new_name = 
fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max); let mut agg = unsafe { agg_col.agg_max(&self.groups) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -507,7 +509,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First); let mut agg = unsafe { agg_col.agg_first(&self.groups) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -544,7 +546,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last); let mut agg = unsafe { agg_col.agg_last(&self.groups) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -581,7 +583,7 @@ impl<'df> GroupBy<'df> { let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique); let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -616,7 +618,7 @@ impl<'df> GroupBy<'df> { ); let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, interpol) }; agg.rename(new_name); - cols.push(agg.into()); + cols.push(agg); } DataFrame::new(cols) } @@ -1124,7 +1126,13 @@ mod test { .unwrap(); assert_eq!( - Vec::from(res.column("bar_sum").unwrap().as_materialized_series().i32().unwrap()), + Vec::from( + res.column("bar_sum") + .unwrap() + .as_materialized_series() + .i32() + .unwrap() + ), &[Some(2), Some(2), Some(1)] ); } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index c245484089d4..0edbfa7a726e 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -174,7 +174,7 @@ pub struct DataFrame { } impl DataFrame { - pub fn materialized_column_iter(&self) -> impl Iterator + ExactSizeIterator { + pub fn materialized_column_iter(&self) -> impl ExactSizeIterator { 
self.columns.iter().map(Column::as_materialized_series) } @@ -630,7 +630,7 @@ impl DataFrame { /// assert_eq!(iterator.next(), None); /// # Ok::<(), PolarsError>(()) /// ``` - pub fn iter(&self) -> impl Iterator + ExactSizeIterator { + pub fn iter(&self) -> impl ExactSizeIterator { self.materialized_column_iter() } @@ -2505,8 +2505,8 @@ impl DataFrame { self.columns[0].clone().as_materialized_series().clone(), )), 2 => min_fn( - &self.columns[0].as_materialized_series(), - &self.columns[1].as_materialized_series(), + self.columns[0].as_materialized_series(), + self.columns[1].as_materialized_series(), ) .map(Some), _ => { diff --git a/crates/polars-core/src/prelude.rs b/crates/polars-core/src/prelude.rs index 2d729261c287..bd1ade2d9b90 100644 --- a/crates/polars-core/src/prelude.rs +++ b/crates/polars-core/src/prelude.rs @@ -40,13 +40,13 @@ pub use crate::datatypes::{ArrayCollectIterExt, *}; pub use crate::error::{ polars_bail, polars_ensure, polars_err, polars_warn, PolarsError, PolarsResult, }; +pub use crate::frame::column::{Column, IntoColumn}; pub use crate::frame::explode::UnpivotArgsIR; #[cfg(feature = "algorithm_group_by")] pub(crate) use crate::frame::group_by::aggregations::*; #[cfg(feature = "algorithm_group_by")] pub use crate::frame::group_by::*; pub use crate::frame::{DataFrame, UniqueKeepStrategy}; -pub use crate::frame::column::{Column, IntoColumn}; pub use crate::hashing::VecHash; pub use crate::named_from::{NamedFrom, NamedFromOwned}; pub use crate::scalar::Scalar; diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index aead770ca37c..d0948e8ff761 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -5,7 +5,7 @@ use polars_utils::pl_str::PlSmallStr; use serde::{Deserialize, Serialize}; use crate::datatypes::{AnyValue, DataType}; -use crate::prelude::{Column, Series, IntoColumn}; +use crate::prelude::{Column, IntoColumn, Series}; #[derive(Clone, Debug, PartialEq)] 
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/crates/polars-core/src/serde/df.rs b/crates/polars-core/src/serde/df.rs index 677f455d552a..52d6a0ee6eae 100644 --- a/crates/polars-core/src/serde/df.rs +++ b/crates/polars-core/src/serde/df.rs @@ -2,7 +2,7 @@ use polars_error::PolarsError; use serde::de::Error; use serde::*; -use crate::prelude::{DataFrame, Column}; +use crate::prelude::{Column, DataFrame}; // utility to ensure we serde to a struct // { diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index e54f7fcc98d6..2a18d7c7a6e9 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -463,6 +463,7 @@ impl Series { /// Cast from physical to logical types without any checks on the validity of the cast. /// /// # Safety + /// /// This can lead to invalid memory access in downstream code. pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { match self.dtype() { diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 4e90f9d757d9..2804c5ce1840 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -105,10 +105,17 @@ pub(crate) mod private { ) -> PolarsResult<()> { polars_bail!(opq = vec_hash_combine, self._dtype()); } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] unsafe fn agg_min(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. 
#[cfg(feature = "algorithm_group_by")] unsafe fn agg_max(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) @@ -119,14 +126,23 @@ pub(crate) mod private { unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] unsafe fn agg_std(&self, groups: &GroupsProxy, _ddof: u8) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] unsafe fn agg_var(&self, groups: &GroupsProxy, _ddof: u8) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety + /// + /// Does no bounds checks, groups must be correct. #[cfg(feature = "algorithm_group_by")] unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index c7d449b77ccb..bfe4e45fd286 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -64,7 +64,7 @@ pub(crate) fn cast_columns( (_, dt) => c.cast(dt), }?; if !ignore_errors && c.null_count() != out.null_count() { - handle_casting_failures(c.as_materialized_series(), &out.as_materialized_series())?; + handle_casting_failures(c.as_materialized_series(), out.as_materialized_series())?; } Ok(out) }; diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index e7b40e02b33f..693dcc1ea31c 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -518,7 +518,8 @@ fn 
rg_to_dfs_optionally_par_over_columns( Some(Filter::new_ranged(rg_slice.0, rg_slice.0 + rg_slice.1)), schema, store, - ).map(Column::from) + ) + .map(Column::from) }) .collect::>>() })? @@ -535,7 +536,8 @@ fn rg_to_dfs_optionally_par_over_columns( Some(Filter::new_ranged(rg_slice.0, rg_slice.0 + rg_slice.1)), schema, store, - ).map(Column::from) + ) + .map(Column::from) }) .collect::>>()? }; @@ -635,7 +637,8 @@ fn rg_to_dfs_par_over_rg( Some(Filter::new_ranged(slice.0, slice.0 + slice.1)), schema, store, - ).map(Column::from) + ) + .map(Column::from) }) .collect::>>()?; diff --git a/crates/polars-lazy/src/tests/arity.rs b/crates/polars-lazy/src/tests/arity.rs index 439d2a8be587..740678af0af4 100644 --- a/crates/polars-lazy/src/tests/arity.rs +++ b/crates/polars-lazy/src/tests/arity.rs @@ -72,5 +72,12 @@ fn test_lazy_ternary() { ) .collect() .unwrap(); - assert_eq!(43, df.column("new").unwrap().as_materialized_series().sum::().unwrap()); + assert_eq!( + 43, + df.column("new") + .unwrap() + .as_materialized_series() + .sum::() + .unwrap() + ); } diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index 5bdc32651860..4d482202cd67 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -232,7 +232,14 @@ fn test_lazy_binary_ops() { .select([col("a").eq(lit(2)).alias("foo")]) .collect() .unwrap(); - assert_eq!(new.column("foo").unwrap().as_materialized_series().sum::().unwrap(), 1); + assert_eq!( + new.column("foo") + .unwrap() + .as_materialized_series() + .sum::() + .unwrap(), + 1 + ); } #[test] @@ -501,7 +508,8 @@ fn test_lazy_query_7() { ]; let data = vec![Some(1.), Some(2.), Some(3.), Some(4.), None, None]; let df = DataFrame::new(vec![ - DatetimeChunked::from_naive_datetime("date".into(), dates, TimeUnit::Nanoseconds).into_column(), + DatetimeChunked::from_naive_datetime("date".into(), dates, TimeUnit::Nanoseconds) + .into_column(), Column::new("data".into(), data), ]) 
.unwrap(); @@ -516,7 +524,13 @@ fn test_lazy_query_7() { )))) .collect() .unwrap(); - let a = out.column("shifted").unwrap().as_materialized_series().sum::().unwrap() - 7.0; + let a = out + .column("shifted") + .unwrap() + .as_materialized_series() + .sum::() + .unwrap() + - 7.0; assert!(a < 0.01 && a > -0.01); } diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index 2b59fbc2535d..43c884b1f563 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -64,7 +64,7 @@ impl StackExec { // new, unique column names. It is immediately // followed by a projection which pulls out the // possibly mismatching column lengths. - unsafe { df.get_columns_mut() } .extend(res.into_iter().map(Column::from)); + unsafe { df.get_columns_mut() }.extend(res.into_iter().map(Column::from)); } else { let height = df.height(); diff --git a/crates/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs index c3bcd391e5af..9caf861b6cd9 100644 --- a/crates/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -216,7 +216,7 @@ pub fn top_k(s: &[Column], descending: bool) -> PolarsResult { #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { // Fallback to more generic impl. - top_k_by_impl(k, src, &[src.clone().into()], vec![descending]) + top_k_by_impl(k, src, &[src.clone()], vec![descending]) }, _dt => { macro_rules! 
dispatch { @@ -289,6 +289,9 @@ fn top_k_by_impl( let idx = _arg_bottom_k(k, by, &mut sort_options)?; - let result = unsafe { src.as_materialized_series().take_unchecked(&idx.into_inner()) }; + let result = unsafe { + src.as_materialized_series() + .take_unchecked(&idx.into_inner()) + }; Ok(result.into()) } diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index cbfcec5ea3f1..81f4fe54e7e4 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -387,8 +387,8 @@ pub trait DataFrameJoinOps: IntoDf { #[cfg(feature = "semi_anti_join")] JoinType::Anti | JoinType::Semi => self._join_impl( other, - vec![lhs_keys.into()], - vec![rhs_keys.into()], + vec![lhs_keys], + vec![rhs_keys], args, _check_rechunk, _verbose, diff --git a/crates/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs index 9bce6710a1db..7b19872a1bc3 100644 --- a/crates/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -254,7 +254,6 @@ pub(super) fn compute_col_idx( let ca: &ChunkedArray = column_agg_physical .as_materialized_series() .as_ref() - .as_ref() .as_ref(); compute_col_idx_numeric(ca) }, @@ -262,7 +261,6 @@ pub(super) fn compute_col_idx( let ca: &ChunkedArray = column_agg_physical .as_materialized_series() .as_ref() - .as_ref() .as_ref(); compute_col_idx_numeric(ca) }, @@ -429,7 +427,6 @@ pub(super) fn compute_row_idx( let ca: &ChunkedArray = index_agg_physical .as_materialized_series() .as_ref() - .as_ref() .as_ref(); compute_row_index(index, ca, count, index_s.dtype()) }, @@ -437,7 +434,6 @@ pub(super) fn compute_row_idx( let ca: &ChunkedArray = index_agg_physical .as_materialized_series() .as_ref() - .as_ref() .as_ref(); compute_row_index(index, ca, count, index_s.dtype()) }, diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index 08d40e187781..cba643cf98e9 100644 --- 
a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -2,12 +2,12 @@ use polars_core::prelude::*; use polars_utils::format_pl_smallstr; fn map_cats( - s: &Column, + s: &Series, labels: &[PlSmallStr], sorted_breaks: &[f64], left_closed: bool, include_breaks: bool, -) -> PolarsResult { +) -> PolarsResult { let out_name = PlSmallStr::from_static("category"); // Create new categorical and pre-register labels for consistent categorical indexes. @@ -58,12 +58,12 @@ fn map_cats( }); let outvals = vec![ - brk_vals.finish().into_column(), + brk_vals.finish().into_series(), bld.finish() ._with_fast_unique(label_has_value.iter().all(bool::clone)) - .into_column(), + .into_series(), ]; - Ok(StructChunked::from_columns(out_name, &outvals)?.into_column()) + Ok(StructChunked::from_series(out_name, &outvals)?.into_series()) } else { Ok(bld .drain_iter_and_finish(s_iter.map(|opt| { @@ -74,7 +74,7 @@ fn map_cats( }) })) ._with_fast_unique(label_has_value.iter().all(bool::clone)) - .into_column()) + .into_series()) } } @@ -96,12 +96,12 @@ pub fn compute_labels(breaks: &[f64], left_closed: bool) -> PolarsResult, labels: Option>, left_closed: bool, include_breaks: bool, -) -> PolarsResult { +) -> PolarsResult { // Breaks must be sorted to cut inputs properly. 
polars_ensure!(!breaks.iter().any(|x| x.is_nan()), ComputeError: "breaks cannot be NaN"); breaks.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); @@ -122,13 +122,13 @@ pub fn cut( } pub fn qcut( - s: &Column, + s: &Series, probs: Vec, labels: Option>, left_closed: bool, allow_duplicates: bool, include_breaks: bool, -) -> PolarsResult { +) -> PolarsResult { polars_ensure!(!probs.iter().any(|x| x.is_nan()), ComputeError: "quantiles cannot be NaN"); if s.null_count() == s.len() { @@ -177,7 +177,7 @@ mod test { use super::map_cats; - let s = Column::new("x".into(), &[1, 2, 3, 4, 5]); + let s = Series::new("x".into(), &[1, 2, 3, 4, 5]); let labels = &["a", "b", "c"].map(PlSmallStr::from_static); let breaks = &[2.0, 4.0]; diff --git a/crates/polars-ops/src/series/ops/fused.rs b/crates/polars-ops/src/series/ops/fused.rs index a2b3215add95..8132eda7c22a 100644 --- a/crates/polars-ops/src/series/ops/fused.rs +++ b/crates/polars-ops/src/series/ops/fused.rs @@ -160,6 +160,7 @@ pub fn fms_columns(a: &Column, b: &Column, c: &Column) -> Column { } else { (&(a.as_materialized_series() * b.as_materialized_series()).unwrap() - c.as_materialized_series()) - .unwrap().into() + .unwrap() + .into() } } diff --git a/crates/polars-ops/src/series/ops/replace.rs b/crates/polars-ops/src/series/ops/replace.rs index 9dde258a86aa..f2d8f8128777 100644 --- a/crates/polars-ops/src/series/ops/replace.rs +++ b/crates/polars-ops/src/series/ops/replace.rs @@ -179,7 +179,10 @@ fn replace_by_multiple( }, )?; - let replaced = joined.column("__POLARS_REPLACE_NEW").unwrap().as_materialized_series(); + let replaced = joined + .column("__POLARS_REPLACE_NEW") + .unwrap() + .as_materialized_series(); if replaced.null_count() == 0 { return Ok(replaced.clone()); @@ -238,7 +241,7 @@ fn create_replacer(mut old: Series, mut new: Series, add_mask: bool) -> PolarsRe // @scalar-opt let mask = Column::new(PlSmallStr::from_static("__POLARS_REPLACE_MASK"), &[true]) .new_from_index(0, new.len()); - vec![old.into(), 
new.into(), mask.into()] + vec![old.into(), new.into(), mask] } else { vec![old.into(), new.into()] }; diff --git a/crates/polars-ops/src/series/ops/rle.rs b/crates/polars-ops/src/series/ops/rle.rs index 6df79825b706..9277913558a2 100644 --- a/crates/polars-ops/src/series/ops/rle.rs +++ b/crates/polars-ops/src/series/ops/rle.rs @@ -4,7 +4,9 @@ use polars_core::series::IsSorted; /// Get the lengths of runs of identical values. pub fn rle(s: &Column) -> PolarsResult { let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len())); - let s_neq = s1.as_materialized_series().not_equal_missing(s2.as_materialized_series())?; + let s_neq = s1 + .as_materialized_series() + .not_equal_missing(s2.as_materialized_series())?; let n_runs = s_neq.sum().ok_or_else(|| polars_err!(InvalidOperation: "could not evaluate 'rle_id' on series of dtype: {}", s.dtype()))? + 1; let mut lengths = Vec::::with_capacity(n_runs as usize); @@ -34,11 +36,13 @@ pub fn rle(s: &Column) -> PolarsResult { /// Similar to `rle`, but maps values to run IDs. 
pub fn rle_id(s: &Column) -> PolarsResult { - if s.len() == 0 { + if s.is_empty() { return Ok(Column::new_empty(s.name().clone(), &IDX_DTYPE)); } let (s1, s2) = (s.slice(0, s.len() - 1), s.slice(1, s.len())); - let s_neq = s1.as_materialized_series().not_equal_missing(s2.as_materialized_series())?; + let s_neq = s1 + .as_materialized_series() + .not_equal_missing(s2.as_materialized_series())?; let mut out = Vec::::with_capacity(s.len()); let mut last = 0; diff --git a/crates/polars-pipe/src/executors/operators/projection.rs b/crates/polars-pipe/src/executors/operators/projection.rs index f948fbe2e4ef..9ae6dbc5299d 100644 --- a/crates/polars-pipe/src/executors/operators/projection.rs +++ b/crates/polars-pipe/src/executors/operators/projection.rs @@ -1,9 +1,9 @@ use std::sync::Arc; use polars_core::error::PolarsResult; +use polars_core::frame::column::{Column, IntoColumn}; use polars_core::frame::DataFrame; use polars_core::schema::SchemaRef; -use polars_core::frame::column::{Column, IntoColumn}; use polars_plan::prelude::ProjectionOptions; use polars_utils::pl_str::PlSmallStr; @@ -118,7 +118,10 @@ impl Operator for HstackOperator { let projected = self .exprs .iter() - .map(|e| e.evaluate(chunk, &context.execution_state).map(Column::from)) + .map(|e| { + e.evaluate(chunk, &context.execution_state) + .map(Column::from) + }) .collect::>>()?; let columns = chunk.data.get_columns()[..width].to_vec(); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/string.rs b/crates/polars-pipe/src/executors/sinks/group_by/string.rs index f16ca2e17bc3..d2fec9c16173 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/string.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/string.rs @@ -210,7 +210,11 @@ impl StringGroupbySink { let mut cols = Vec::with_capacity(1 + self.number_of_aggs()); cols.push(key_builder.finish().into_series().into_column()); - cols.extend(buffers.into_iter().map(|buf| buf.into_series().into_column())); + cols.extend( + buffers + 
.into_iter() + .map(|buf| buf.into_series().into_column()), + ); physical_agg_to_logical(&mut cols, &self.output_schema); Some(unsafe { DataFrame::new_no_checks(cols) }) }) diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index d1c92cba1df2..dce6d44bce94 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -221,9 +221,12 @@ pub(super) fn contains(s: &[Column]) -> PolarsResult { polars_ensure!(matches!(array.dtype(), DataType::Array(_, _)), SchemaMismatch: "invalid series dtype: expected `Array`, got `{}`", array.dtype(), ); - Ok(is_in(item.as_materialized_series(), array.as_materialized_series())? - .with_name(array.name().clone()) - .into_column()) + Ok(is_in( + item.as_materialized_series(), + array.as_materialized_series(), + )? + .with_name(array.name().clone()) + .into_column()) } #[cfg(feature = "array_count")] @@ -236,7 +239,8 @@ pub(super) fn count_matches(args: &[Column]) -> PolarsResult { element.len() ); let ca = s.array()?; - ca.array_count_matches(element.get(0).unwrap()).map(Column::from) + ca.array_count_matches(element.get(0).unwrap()) + .map(Column::from) } pub(super) fn shift(s: &[Column]) -> PolarsResult { diff --git a/crates/polars-plan/src/dsl/function_expr/clip.rs b/crates/polars-plan/src/dsl/function_expr/clip.rs index 4b537c811235..9a721d65d198 100644 --- a/crates/polars-plan/src/dsl/function_expr/clip.rs +++ b/crates/polars-plan/src/dsl/function_expr/clip.rs @@ -3,17 +3,17 @@ use super::*; pub(super) fn clip(s: &[Column], has_min: bool, has_max: bool) -> PolarsResult { match (has_min, has_max) { (true, true) => polars_ops::series::clip( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), - &s[2].as_materialized_series(), + s[0].as_materialized_series(), + s[1].as_materialized_series(), + s[2].as_materialized_series(), ), (true, false) => polars_ops::series::clip_min( - 
&s[0].as_materialized_series(), - &s[1].as_materialized_series(), + s[0].as_materialized_series(), + s[1].as_materialized_series(), ), (false, true) => polars_ops::series::clip_max( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), + s[0].as_materialized_series(), + s[1].as_materialized_series(), ), _ => unreachable!(), } diff --git a/crates/polars-plan/src/dsl/function_expr/correlation.rs b/crates/polars-plan/src/dsl/function_expr/correlation.rs index 14b3f0f77a4c..0413bac9dc01 100644 --- a/crates/polars-plan/src/dsl/function_expr/correlation.rs +++ b/crates/polars-plan/src/dsl/function_expr/correlation.rs @@ -113,20 +113,26 @@ fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsRes let a = a.drop_nulls(); let b = b.drop_nulls(); - let a_rank = a.as_materialized_series().rank( - RankOptions { - method: RankMethod::Average, - ..Default::default() - }, - None, - ).into(); - let b_rank = b.as_materialized_series().rank( - RankOptions { - method: RankMethod::Average, - ..Default::default() - }, - None, - ).into(); + let a_rank = a + .as_materialized_series() + .rank( + RankOptions { + method: RankMethod::Average, + ..Default::default() + }, + None, + ) + .into(); + let b_rank = b + .as_materialized_series() + .rank( + RankOptions { + method: RankMethod::Average, + ..Default::default() + }, + None, + ) + .into(); pearson_corr(&[a_rank, b_rank], ddof) } diff --git a/crates/polars-plan/src/dsl/function_expr/cut.rs b/crates/polars-plan/src/dsl/function_expr/cut.rs new file mode 100644 index 000000000000..faafc7aa3f76 --- /dev/null +++ b/crates/polars-plan/src/dsl/function_expr/cut.rs @@ -0,0 +1,37 @@ +use polars_core::prelude::*; + +pub(crate) fn cut( + s: &Column, + breaks: Vec, + labels: Option>, + left_closed: bool, + include_breaks: bool, +) -> PolarsResult { + polars_ops::prelude::cut( + s.as_materialized_series(), + breaks, + labels, + left_closed, + include_breaks, + ) + .map(Column::from) +} + +pub(crate) fn qcut( + s: 
&Column, + probs: Vec, + labels: Option>, + left_closed: bool, + allow_duplicates: bool, + include_breaks: bool, +) -> PolarsResult { + polars_ops::prelude::qcut( + s.as_materialized_series(), + probs, + labels, + left_closed, + allow_duplicates, + include_breaks, + ) + .map(Column::from) +} diff --git a/crates/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs index 9dbde708e7bf..436e1718d5e3 100644 --- a/crates/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -465,10 +465,7 @@ pub(super) fn truncate(s: &[Column]) -> PolarsResult { #[cfg(feature = "offset_by")] pub(super) fn offset_by(s: &[Column]) -> PolarsResult { - impl_offset_by( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), - ).map(Column::from) + impl_offset_by(s[0].as_materialized_series(), s[1].as_materialized_series()).map(Column::from) } #[cfg(feature = "month_start")] diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index fb73f51ab5de..6ee70819b4f7 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -16,11 +16,8 @@ pub(super) fn diff(s: &Column, n: i64, null_behavior: NullBehavior) -> PolarsRes #[cfg(feature = "pct_change")] pub(super) fn pct_change(s: &[Column]) -> PolarsResult { - polars_ops::prelude::pct_change( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), - ) - .map(Column::from) + polars_ops::prelude::pct_change(s[0].as_materialized_series(), s[1].as_materialized_series()) + .map(Column::from) } #[cfg(feature = "interpolate")] @@ -175,9 +172,9 @@ pub(super) fn hist( #[cfg(feature = "replace")] pub(super) fn replace(s: &[Column]) -> PolarsResult { polars_ops::series::replace( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), - &s[2].as_materialized_series(), + 
s[0].as_materialized_series(), + s[1].as_materialized_series(), + s[2].as_materialized_series(), ) .map(Column::from) } @@ -186,16 +183,16 @@ pub(super) fn replace(s: &[Column]) -> PolarsResult { pub(super) fn replace_strict(s: &[Column], return_dtype: Option) -> PolarsResult { match s.get(3) { Some(default) => polars_ops::series::replace_or_default( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), - &s[2].as_materialized_series(), + s[0].as_materialized_series(), + s[1].as_materialized_series(), + s[2].as_materialized_series(), default.as_materialized_series(), return_dtype, ), None => polars_ops::series::replace_strict( - &s[0].as_materialized_series(), - &s[1].as_materialized_series(), - &s[2].as_materialized_series(), + s[0].as_materialized_series(), + s[1].as_materialized_series(), + s[2].as_materialized_series(), return_dtype, ), } diff --git a/crates/polars-plan/src/dsl/function_expr/list.rs b/crates/polars-plan/src/dsl/function_expr/list.rs index c2badaecea8d..35467eff92bc 100644 --- a/crates/polars-plan/src/dsl/function_expr/list.rs +++ b/crates/polars-plan/src/dsl/function_expr/list.rs @@ -508,7 +508,8 @@ pub(super) fn gather(args: &[Column], null_on_oob: bool) -> PolarsResult // make sure we return a list out.reshape_list(&[-1, 1]) } else { - ca.lst_gather(idx.as_materialized_series(), null_on_oob).map(Column::from) + ca.lst_gather(idx.as_materialized_series(), null_on_oob) + .map(Column::from) } } @@ -518,7 +519,9 @@ pub(super) fn gather_every(args: &[Column]) -> PolarsResult { let n = &args[1].strict_cast(&IDX_DTYPE)?; let offset = &args[2].strict_cast(&IDX_DTYPE)?; - ca.list()?.lst_gather_every(n.idx()?, offset.idx()?).map(Column::from) + ca.list()? + .lst_gather_every(n.idx()?, offset.idx()?) 
+ .map(Column::from) } #[cfg(feature = "list_count")] @@ -600,10 +603,10 @@ pub(super) fn set_operation(s: &[Column], set_type: SetOperation) -> PolarsResul let s0 = &s[0]; let s1 = &s[1]; - if s0.len() == 0 || s1.len() == 0 { + if s0.is_empty() || s1.is_empty() { return match set_type { SetOperation::Intersection => { - if s0.len() == 0 { + if s0.is_empty() { Ok(s0.clone()) } else { Ok(s1.clone().with_name(s0.name().clone())) @@ -611,7 +614,7 @@ pub(super) fn set_operation(s: &[Column], set_type: SetOperation) -> PolarsResul }, SetOperation::Difference => Ok(s0.clone()), SetOperation::Union | SetOperation::SymmetricDifference => { - if s0.len() == 0 { + if s0.is_empty() { Ok(s1.clone().with_name(s0.name().clone())) } else { Ok(s0.clone()) diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 0f66344a2cba..0f29723eee4e 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -20,6 +20,7 @@ mod concat; mod correlation; #[cfg(feature = "cum_agg")] mod cum; +mod cut; #[cfg(feature = "temporal")] mod datetime; mod dispatch; @@ -1074,7 +1075,7 @@ impl From for SpecialEq> { left_closed, include_breaks, } => map!( - cut, + cut::cut, breaks.clone(), labels.clone(), left_closed, @@ -1088,7 +1089,7 @@ impl From for SpecialEq> { allow_duplicates, include_breaks, } => map!( - qcut, + cut::qcut, probs.clone(), labels.clone(), left_closed, diff --git a/crates/polars-plan/src/dsl/function_expr/rolling_by.rs b/crates/polars-plan/src/dsl/function_expr/rolling_by.rs index dfc6e34e8c5a..3077c83355f2 100644 --- a/crates/polars-plan/src/dsl/function_expr/rolling_by.rs +++ b/crates/polars-plan/src/dsl/function_expr/rolling_by.rs @@ -44,7 +44,7 @@ pub(super) fn rolling_min_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_min_by(&s[1].as_materialized_series(), options) + .rolling_min_by(s[1].as_materialized_series(), options) 
.map(Column::from) } @@ -54,7 +54,7 @@ pub(super) fn rolling_max_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_max_by(&s[1].as_materialized_series(), options) + .rolling_max_by(s[1].as_materialized_series(), options) .map(Column::from) } @@ -64,7 +64,7 @@ pub(super) fn rolling_mean_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_mean_by(&s[1].as_materialized_series(), options) + .rolling_mean_by(s[1].as_materialized_series(), options) .map(Column::from) } @@ -74,7 +74,7 @@ pub(super) fn rolling_sum_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_sum_by(&s[1].as_materialized_series(), options) + .rolling_sum_by(s[1].as_materialized_series(), options) .map(Column::from) } @@ -84,7 +84,7 @@ pub(super) fn rolling_quantile_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_quantile_by(&s[1].as_materialized_series(), options) + .rolling_quantile_by(s[1].as_materialized_series(), options) .map(Column::from) } @@ -94,7 +94,7 @@ pub(super) fn rolling_var_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_var_by(&s[1].as_materialized_series(), options) + .rolling_var_by(s[1].as_materialized_series(), options) .map(Column::from) } @@ -104,6 +104,6 @@ pub(super) fn rolling_std_by( ) -> PolarsResult { // @scalar-opt s[0].as_materialized_series() - .rolling_std_by(&s[1].as_materialized_series(), options) + .rolling_std_by(s[1].as_materialized_series(), options) .map(Column::from) } diff --git a/crates/polars-plan/src/dsl/function_expr/shrink_type.rs b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs index 8489adda82a8..99dbb97cc67c 100644 --- a/crates/polars-plan/src/dsl/function_expr/shrink_type.rs +++ b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs @@ -37,5 +37,6 @@ pub(super) fn shrink(s: Column) -> PolarsResult { } else { Ok(s.clone()) } - }.map(Column::from) + } + .map(Column::from) } diff --git 
a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index d3743463d308..ba06dc00e67c 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -672,7 +672,7 @@ fn to_date(s: &Column, options: &StrptimeOptions) -> PolarsResult { }; if options.strict && ca.null_count() != out.null_count() { - handle_casting_failures(s.as_materialized_series(), &out.as_materialized_series())?; + handle_casting_failures(s.as_materialized_series(), out.as_materialized_series())?; } Ok(out.into_column()) } @@ -719,7 +719,7 @@ fn to_datetime( }; if options.strict && datetime_strings.null_count() != out.null_count() { - handle_casting_failures(&s[0].as_materialized_series(), &out.as_materialized_series())?; + handle_casting_failures(s[0].as_materialized_series(), out.as_materialized_series())?; } Ok(out.into_column()) } @@ -736,7 +736,7 @@ fn to_time(s: &Column, options: &StrptimeOptions) -> PolarsResult { .into_column(); if options.strict && ca.null_count() != out.null_count() { - handle_casting_failures(s.as_materialized_series(), &out.as_materialized_series())?; + handle_casting_failures(s.as_materialized_series(), out.as_materialized_series())?; } Ok(out.into_column()) } diff --git a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs index 0e7080b0461b..c0d83822aef9 100644 --- a/crates/polars-plan/src/dsl/function_expr/trigonometry.rs +++ b/crates/polars-plan/src/dsl/function_expr/trigonometry.rs @@ -113,7 +113,9 @@ where { let dtype = T::get_dtype(); let x = x.cast(&dtype)?; - let x = y.unpack_series_matching_type(x.as_materialized_series()).unwrap(); + let x = y + .unpack_series_matching_type(x.as_materialized_series()) + .unwrap(); if x.len() == 1 { let x_value = x diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index a2cf069e1db3..786867f32e14 100644 --- 
a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1512,7 +1512,7 @@ impl Expr { if let DataType::Float32 = c.dtype() { out.cast(&DataType::Float32).map(Column::from).map(Some) } else { - Ok(Some(out.into())) + Ok(Some(out)) } }, GetOutput::map_field(|field| { diff --git a/crates/polars-plan/src/dsl/python_udf.rs b/crates/polars-plan/src/dsl/python_udf.rs index 0fb786db493d..0f9ac4a3dc9a 100644 --- a/crates/polars-plan/src/dsl/python_udf.rs +++ b/crates/polars-plan/src/dsl/python_udf.rs @@ -3,8 +3,8 @@ use std::sync::Arc; use polars_core::datatypes::{DataType, Field}; use polars_core::error::*; -use polars_core::frame::DataFrame; use polars_core::frame::column::Column; +use polars_core::frame::DataFrame; use polars_core::schema::Schema; use pyo3::prelude::*; use pyo3::pybacked::PyBackedBytes; diff --git a/crates/polars-plan/src/dsl/udf.rs b/crates/polars-plan/src/dsl/udf.rs index 74371639a54a..b09ef6f556a2 100644 --- a/crates/polars-plan/src/dsl/udf.rs +++ b/crates/polars-plan/src/dsl/udf.rs @@ -5,7 +5,7 @@ use polars_core::prelude::Field; use polars_core::schema::Schema; use polars_utils::pl_str::PlSmallStr; -use super::{Expr, GetOutput, ColumnsUdf, SpecialEq}; +use super::{ColumnsUdf, Expr, GetOutput, SpecialEq}; use crate::prelude::{Context, FunctionOptions}; /// Represents a user-defined function diff --git a/crates/polars-python/src/interop/numpy/to_numpy_df.rs b/crates/polars-python/src/interop/numpy/to_numpy_df.rs index b249970c438d..c14753bdc7a3 100644 --- a/crates/polars-python/src/interop/numpy/to_numpy_df.rs +++ b/crates/polars-python/src/interop/numpy/to_numpy_df.rs @@ -113,7 +113,10 @@ fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> { fn check_df_columns_contiguous(df: &DataFrame) -> bool { let columns = df.get_columns(); - if columns.iter().any(|s| s.as_materialized_series().n_chunks() > 1) { + if columns + .iter() + .any(|s| s.as_materialized_series().n_chunks() > 1) + { return false; } if 
columns.len() <= 1 { @@ -174,7 +177,13 @@ where T: PolarsNumericType, T::Native: Element, { - let ca: &ChunkedArray = df.get_columns().first().unwrap().as_materialized_series().unpack().unwrap(); + let ca: &ChunkedArray = df + .get_columns() + .first() + .unwrap() + .as_materialized_series() + .unpack() + .unwrap(); let first_slice = ca.data_views().next().unwrap(); let start_ptr = first_slice.as_ptr(); diff --git a/crates/polars-time/src/group_by/dynamic.rs b/crates/polars-time/src/group_by/dynamic.rs index 480c678e920c..8a8d2312d580 100644 --- a/crates/polars-time/src/group_by/dynamic.rs +++ b/crates/polars-time/src/group_by/dynamic.rs @@ -787,7 +787,10 @@ mod test { let expected = Series::new("".into(), [0.0, 8.0, 8.0, 9.333333333333343, 24.5, 0.0]); assert!(abs(&(var - expected)?).unwrap().lt(1e-12).unwrap().all()); - let quantile = unsafe { a.as_materialized_series().agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) }; + let quantile = unsafe { + a.as_materialized_series() + .agg_quantile(&groups, 0.5, QuantileInterpolOptions::Linear) + }; let expected = Series::new("".into(), [3.0, 5.0, 5.0, 6.0, 5.5, 1.0]); assert_eq!(quantile, expected); diff --git a/crates/polars/tests/it/core/joins.rs b/crates/polars/tests/it/core/joins.rs index 9388a94d5960..92f3d883f9dc 100644 --- a/crates/polars/tests/it/core/joins.rs +++ b/crates/polars/tests/it/core/joins.rs @@ -137,7 +137,14 @@ fn test_full_outer_join() -> PolarsResult<()> { JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), )?; assert_eq!(joined.height(), 5); - assert_eq!(joined.column("days")?.as_materialized_series().sum::().unwrap(), 7); + assert_eq!( + joined + .column("days")? 
+ .as_materialized_series() + .sum::() + .unwrap(), + 7 + ); let df_left = df!( "a"=> ["a", "b", "a", "z"], diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index b1285d5710b6..ebd7419c514e 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -221,7 +221,14 @@ fn test_parser() -> PolarsResult<()> { assert_eq!(col.get(2)?, AnyValue::String("Setosa")); assert_eq!("sepal_length", df.get_columns()[0].name().as_str()); - assert_eq!(1, df.column("sepal_length").unwrap().as_materialized_series().chunks().len()); + assert_eq!( + 1, + df.column("sepal_length") + .unwrap() + .as_materialized_series() + .chunks() + .len() + ); assert_eq!(df.height(), 7); // test windows line endings diff --git a/docs/src/rust/user-guide/expressions/lists.rs b/docs/src/rust/user-guide/expressions/lists.rs index 9ce160cd58aa..fd097d98df7e 100644 --- a/docs/src/rust/user-guide/expressions/lists.rs +++ b/docs/src/rust/user-guide/expressions/lists.rs @@ -142,8 +142,8 @@ fn main() -> Result<(), Box> { col2.append_slice(&[1, 7, 3]); col2.append_slice(&[8, 1, 0]); let array_df = DataFrame::new(vec![ - col1.finish().into_series(), - col2.finish().into_series(), + col1.finish().into_column(), + col2.finish().into_column(), ])?; println!("{}", &array_df); From 95be5d7fd909ecc8627503bd8499aa0273af4f46 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:07:42 +0200 Subject: [PATCH 14/42] fix more clippy issues --- crates/polars-core/src/frame/column.rs | 38 +++++++++++++------ .../rust/user-guide/expressions/structs.rs | 4 +- .../transformations/time-series/rolling.rs | 2 +- .../transformations/time-series/timezones.rs | 2 +- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index ecad8a22aab6..35478cbb3a24 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -23,7 +23,7 @@ pub enum 
Column { #[derive(Debug, Clone)] pub struct ScalarColumn { name: PlSmallStr, - value: AnyValue<'static>, + value: Scalar, materialized: OnceLock, length: usize, } @@ -49,7 +49,7 @@ impl Column { } #[inline] - pub fn new_scalar(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { + pub fn new_scalar(name: PlSmallStr, value: Scalar, length: usize) -> Self { Self::Scalar(ScalarColumn::new(name, value, length)) } @@ -85,14 +85,21 @@ impl Column { #[inline] pub fn dtype(&self) -> &DataType { - // @scalar-opt - self.as_materialized_series().dtype() + match self { + Column::Series(s) => s.dtype(), + Column::Scalar(s) => s.value.dtype(), + } } #[inline] pub fn field(&self) -> Cow { - // @scalar-opt - self.as_materialized_series().field() + match self { + Column::Series(s) => s.field(), + Column::Scalar(s) => match s.materialized.get() { + None => Cow::Owned(Field::new(s.name.clone(), s.value.dtype().clone())), + Some(s) => s.field(), + }, + } } #[inline] @@ -166,6 +173,7 @@ impl Column { self.as_materialized_series().str() } + #[cfg(feature = "dtype-datetime")] pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { // @scalar-opt self.as_materialized_series().datetime() @@ -461,6 +469,7 @@ impl Column { } } + #[cfg(feature = "dtype-categorical")] pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { self.as_materialized_series().categorical() } @@ -682,6 +691,7 @@ impl Column { .map(Self::from) } + #[cfg(feature = "dtype-array")] pub fn array(&self) -> PolarsResult<&ArrayChunked> { // @scalar-opt self.as_materialized_series().array() @@ -724,11 +734,13 @@ impl Column { self.as_materialized_series().is_not_nan() } + #[cfg(feature = "dtype-date")] pub fn date(&self) -> PolarsResult<&DateChunked> { // @scalar-opt self.as_materialized_series().date() } + #[cfg(feature = "dtype-duration")] pub fn duration(&self) -> PolarsResult<&DurationChunked> { // @scalar-opt self.as_materialized_series().duration() @@ -770,6 +782,12 @@ impl Column { pub fn 
get_object(&self, index: usize) -> Option<&dyn PolarsObjectSafe> { self.as_materialized_series().get_object(index) } + + pub fn bitand(&self, rhs: &Self) -> PolarsResult { + self.as_materialized_series() + .bitand(rhs.as_materialized_series()) + .map(Column::from) + } } impl Default for Column { @@ -989,7 +1007,7 @@ where impl ScalarColumn { #[inline] - pub fn new(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Self { + pub fn new(name: PlSmallStr, value: Scalar, length: usize) -> Self { Self { name, value, @@ -998,11 +1016,9 @@ impl ScalarColumn { } } - fn _to_series(name: PlSmallStr, value: AnyValue<'static>, length: usize) -> Series { + fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { // @TODO: There is probably a better way to do this. - Scalar::new(value.dtype(), value) - .into_series(name) - .new_from_index(0, length) + value.into_series(name).new_from_index(0, length) } pub fn to_series(&self) -> Series { diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs index 25ed02daf827..cc6fff831d06 100644 --- a/docs/src/rust/user-guide/expressions/structs.rs +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -50,7 +50,7 @@ fn main() -> Result<(), Box> { // --8<-- [end:series_struct_extract] // --8<-- [start:series_struct_rename] - let out = DataFrame::new([rating_series].into())? + let out = DataFrame::new([rating_series.into_column()].into())? 
.lazy() .select([col("ratings") .struct_() @@ -130,7 +130,7 @@ fn main() -> Result<(), Box> { }) .collect(); - Ok(Some(out.into_series())) + Ok(Some(out.into_column())) }, GetOutput::from_type(DataType::Int32), ) diff --git a/docs/src/rust/user-guide/transformations/time-series/rolling.rs b/docs/src/rust/user-guide/transformations/time-series/rolling.rs index 19b57f2d0c33..4db0ea1be92a 100644 --- a/docs/src/rust/user-guide/transformations/time-series/rolling.rs +++ b/docs/src/rust/user-guide/transformations/time-series/rolling.rs @@ -93,7 +93,7 @@ fn main() -> Result<(), Box> { .into_iter() .map(|d| d.map(|v| v / 1000 / 24 / 60 / 60)) .collect::() - .into_series(), + .into_column(), )) }, GetOutput::from_type(DataType::Int64), diff --git a/docs/src/rust/user-guide/transformations/time-series/timezones.rs b/docs/src/rust/user-guide/transformations/time-series/timezones.rs index 489786cb844e..476a7a332b5c 100644 --- a/docs/src/rust/user-guide/transformations/time-series/timezones.rs +++ b/docs/src/rust/user-guide/transformations/time-series/timezones.rs @@ -5,7 +5,7 @@ use polars::prelude::*; fn main() -> Result<(), Box> { // --8<-- [start:example] let ts = ["2021-03-27 03:00", "2021-03-28 03:00"]; - let tz_naive = Series::new("tz_naive".into(), &ts); + let tz_naive = Column::new("tz_naive".into(), &ts); let time_zones_df = DataFrame::new(vec![tz_naive])? 
.lazy() .select([col("tz_naive").str().to_datetime( From 5617a00b634b6188242407bdc374d9468d74d971 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:12:38 +0200 Subject: [PATCH 15/42] feature gate struct_ --- crates/polars-core/src/frame/column.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 35478cbb3a24..bc3de52c82f9 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -518,6 +518,7 @@ impl Column { self.as_materialized_series().bool() } + #[cfg(feature = "dtype-struct")] pub fn struct_(&self) -> PolarsResult<&StructChunked> { // @scalar-opt self.as_materialized_series().struct_() From da653515688c22030387f85c06d31889295b4f63 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:17:58 +0200 Subject: [PATCH 16/42] fix doc building issues --- crates/polars-core/src/chunked_array/temporal/mod.rs | 2 +- crates/polars-core/src/frame/horizontal.rs | 6 +++--- crates/polars-plan/src/dsl/expr_dyn_fn.rs | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crates/polars-core/src/chunked_array/temporal/mod.rs b/crates/polars-core/src/chunked_array/temporal/mod.rs index e3ab1c01c164..ad35aa90899e 100644 --- a/crates/polars-core/src/chunked_array/temporal/mod.rs +++ b/crates/polars-core/src/chunked_array/temporal/mod.rs @@ -64,7 +64,7 @@ pub fn parse_time_zone(tz: &str) -> PolarsResult { /// /// E.g. +01:00 -> Etc/GMT-1 /// -/// Note: the sign appears reversed, but is correct, see https://en.wikipedia.org/wiki/Tz_database#Area: +/// Note: the sign appears reversed, but is correct, see : /// > In order to conform with the POSIX style, those zone names beginning with /// > "Etc/GMT" have their sign reversed from the standard ISO 8601 convention. 
/// > In the "Etc" area, zones west of GMT have a positive sign and those east diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index 17bd1936a8f6..cf65b807f3e5 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -29,21 +29,21 @@ impl DataFrame { /// /// # Safety /// The caller must ensure: - /// - the length of all [`Series`] is equal to the height of this [`DataFrame`] + /// - the length of all [`Column`] is equal to the height of this [`DataFrame`] /// - the columns names are unique pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self { self.columns.extend_from_slice(columns); self } - /// Add multiple [`Series`] to a [`DataFrame`]. + /// Add multiple [`Column`] to a [`DataFrame`]. /// The added `Series` are required to have the same length. /// /// # Example /// /// ```rust /// # use polars_core::prelude::*; - /// fn stack(df: &mut DataFrame, columns: &[Series]) { + /// fn stack(df: &mut DataFrame, columns: &[Column]) { /// df.hstack_mut(columns); /// } /// ``` diff --git a/crates/polars-plan/src/dsl/expr_dyn_fn.rs b/crates/polars-plan/src/dsl/expr_dyn_fn.rs index d79858706385..e134e8b556ef 100644 --- a/crates/polars-plan/src/dsl/expr_dyn_fn.rs +++ b/crates/polars-plan/src/dsl/expr_dyn_fn.rs @@ -68,6 +68,8 @@ impl<'a> Deserialize<'a> for SpecialEq> { } #[cfg(not(feature = "python"))] { + _ = deserializer; + Err(D::Error::custom( "deserialization not supported for this 'opaque' function", )) @@ -394,6 +396,8 @@ impl<'a> Deserialize<'a> for GetOutput { } #[cfg(not(feature = "python"))] { + _ = deserializer; + Err(D::Error::custom( "deserialization not supported for this output field", )) From 21e6c7e9d72bc20a9d901a001507ca3ce8482efe Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:25:08 +0200 Subject: [PATCH 17/42] feature gate more things --- crates/polars-core/src/frame/chunks.rs | 5 +---- 
crates/polars-core/src/frame/column.rs | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/crates/polars-core/src/frame/chunks.rs b/crates/polars-core/src/frame/chunks.rs index 3fffbc1ce22f..704df0b7d140 100644 --- a/crates/polars-core/src/frame/chunks.rs +++ b/crates/polars-core/src/frame/chunks.rs @@ -29,10 +29,7 @@ impl DataFrame { let columns = self .get_columns() .iter() - .map(|column| match column { - Column::Series(s) => s.select_chunk(i), - Column::Scalar(s) => s.select_chunk(i), - }) + .map(|column| column.as_materialized_series().select_chunk(i)) .map(Column::from) .collect::>(); diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index bc3de52c82f9..e74ed8c9217d 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -7,7 +7,6 @@ use polars_error::PolarsResult; use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::metadata::MetadataFlags; -use crate::chunked_array::object::PolarsObjectSafe; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; @@ -615,6 +614,7 @@ impl Column { self.as_materialized_series().get(index) } + #[cfg(feature = "dtype-decimal")] pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { // @scalar-opt self.as_materialized_series().decimal() @@ -632,6 +632,7 @@ impl Column { .map(Self::from) } + #[cfg(feature = "dtype-array")] pub fn reshape_array(&self, dimensions: &[i64]) -> PolarsResult { // @scalar-opt self.as_materialized_series() @@ -651,11 +652,13 @@ impl Column { self.as_materialized_series().filter(filter).map(Self::from) } + #[cfg(feature = "random")] pub fn shuffle(&self, seed: Option) -> Self { // @scalar-opt self.as_materialized_series().shuffle(seed).into() } + #[cfg(feature = "random")] pub fn sample_frac( &self, frac: f64, @@ -668,6 +671,7 @@ impl Column { .map(Self::from) } + #[cfg(feature = "random")] pub fn sample_n( &self, n: usize, @@ -780,7 +784,11 @@ impl 
Column { self.as_materialized_series().get_unchecked(index) } - pub fn get_object(&self, index: usize) -> Option<&dyn PolarsObjectSafe> { + #[cfg(feature = "object")] + pub fn get_object( + &self, + index: usize, + ) -> Option<&dyn crate::chunked_array::object::PolarsObjectSafe> { self.as_materialized_series().get_object(index) } @@ -1036,12 +1044,6 @@ impl ScalarColumn { .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) } - pub fn select_chunk(&self, _: usize) -> Series { - // @scalar-opt - // @scalar-correctness? - todo!() - } - fn with_name(self, name: PlSmallStr) -> Self { // @TODO: Keep materialized somehow? Self::new(name, self.value, self.length) From bc5f655ecc435ab20104b23d248130024b53e2d1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:34:52 +0200 Subject: [PATCH 18/42] even more feature gating --- Cargo.lock | 480 +----------------- crates/polars-arrow/Cargo.toml | 17 - crates/polars-compute/Cargo.toml | 3 - crates/polars-core/Cargo.toml | 4 - crates/polars-core/src/frame/column.rs | 2 + crates/polars-io/Cargo.toml | 3 - crates/polars-lazy/Cargo.toml | 3 - crates/polars-ops/Cargo.toml | 3 - crates/polars-parquet/Cargo.toml | 3 - .../polars-plan/src/dsl/function_expr/mod.rs | 1 + crates/polars-sql/Cargo.toml | 5 - crates/polars-time/Cargo.toml | 3 - crates/polars-utils/Cargo.toml | 3 - crates/polars/Cargo.toml | 16 - 14 files changed, 9 insertions(+), 537 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94a0a87e3d8c..ac1d699be13d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,50 +88,12 @@ dependencies = [ "libc", ] -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstyle" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" - 
[[package]] name = "anyhow" version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" -[[package]] -name = "apache-avro" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13" -dependencies = [ - "bigdecimal", - "crc32fast", - "digest", - "libflate 2.1.0", - "log", - "num-bigint", - "quad-rand", - "rand", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum", - "strum_macros", - "thiserror", - "typed-builder", - "uuid", -] - [[package]] name = "arboard" version = "3.4.0" @@ -219,26 +181,6 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" -[[package]] -name = "arrow2" -version = "0.17.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" -dependencies = [ - "ahash", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "foreign_vec", - "getrandom", - "hash_hasher", - "num-traits", - "rustc_version", - "simdutf8", -] - [[package]] name = "async-stream" version = "0.3.5" @@ -309,7 +251,7 @@ dependencies = [ "crc", "fallible-streaming-iterator", "futures", - "libflate 1.4.0", + "libflate", "serde", "serde_json", "snap", @@ -738,29 +680,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" -[[package]] -name = "bigdecimal" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d712318a27c7150326677b321a5fa91b55f6d9034ffd67f20319e147d40cee" -dependencies = [ - "autocfg", - "libm", - "num-bigint", - "num-integer", - "num-traits", - "serde", -] - -[[package]] -name 
= "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bitflags" version = "2.6.0" @@ -893,21 +812,6 @@ dependencies = [ "url", ] -[[package]] -name = "casey" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614586263949597dcc18675da12ef9b429135e13628d92eb8b8c6fa50ca5656b" -dependencies = [ - "syn 1.0.109", -] - -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - [[package]] name = "castaway" version = "0.2.3" @@ -996,31 +900,6 @@ dependencies = [ "half", ] -[[package]] -name = "clap" -version = "4.5.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" -dependencies = [ - "clap_builder", -] - -[[package]] -name = "clap_builder" -version = "4.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" -dependencies = [ - "anstyle", - "clap_lex", -] - -[[package]] -name = "clap_lex" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" - [[package]] name = "clipboard-win" version = "5.4.0" @@ -1114,15 +993,6 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core2" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" -dependencies = [ - "memchr", -] - [[package]] name = 
"cpufeatures" version = "0.2.13" @@ -1165,42 +1035,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools 0.10.5", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools 0.10.5", -] - [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -1304,12 +1138,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "dary_heap" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" - [[package]] name = "der" version = "0.6.1" @@ -1405,16 +1233,6 @@ dependencies = [ "syn 2.0.76", ] -[[package]] -name = "env_logger" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" -dependencies = [ - "log", - "regex", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -1497,12 +1315,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" - [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1739,12 +1551,6 @@ dependencies = 
[ "serde", ] -[[package]] -name = "hash_hasher" -version = "2.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" - [[package]] name = "hashbrown" version = "0.14.5" @@ -1775,12 +1581,6 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" -[[package]] -name = "hermit-abi" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" - [[package]] name = "hex" version = "0.4.3" @@ -2039,17 +1839,6 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" -[[package]] -name = "is-terminal" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" -dependencies = [ - "hermit-abi 0.4.0", - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "itertools" version = "0.10.5" @@ -2129,12 +1918,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "lexical-core" version = "0.8.5" @@ -2213,20 +1996,7 @@ checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" dependencies = [ "adler32", "crc32fast", - "libflate_lz77 1.2.0", -] - -[[package]] -name = "libflate" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" -dependencies = [ - "adler32", - "core2", - "crc32fast", - "dary_heap", - "libflate_lz77 2.1.0", + "libflate_lz77", ] 
[[package]] @@ -2238,17 +2008,6 @@ dependencies = [ "rle-decode-fast", ] -[[package]] -name = "libflate_lz77" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" -dependencies = [ - "core2", - "hashbrown", - "rle-decode-fast", -] - [[package]] name = "libgit2-sys" version = "0.17.0+1.8.1" @@ -2452,7 +2211,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ - "hermit-abi 0.3.9", + "hermit-abi", "libc", "wasi", "windows-sys 0.52.0", @@ -2533,7 +2292,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -2751,12 +2509,6 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" -[[package]] -name = "oorandom" -version = "11.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" - [[package]] name = "openssl-probe" version = "0.1.5" @@ -2923,45 +2675,10 @@ dependencies = [ "array-init-cursor", ] -[[package]] -name = "plotters" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" - -[[package]] -name = "plotters-svg" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" -dependencies = [ - "plotters-backend", -] - [[package]] name = "polars" version = "0.43.1" dependencies = [ - "ahash", - "apache-avro", - "arrow-buffer", - "avro-schema", - "either", - "ethnum", - "futures", "getrandom", "polars-arrow", "polars-core", @@ -2974,10 +2691,6 @@ dependencies = [ "polars-sql", "polars-time", "polars-utils", - "proptest", - "rand", - "tokio", - "tokio-util", "version_check", ] @@ -2997,14 +2710,10 @@ dependencies = [ "bytemuck", "chrono", "chrono-tz", - "criterion", - "crossbeam-channel", - "doc-comment", "dyn-clone", "either", "ethnum", "fast-float", - "flate2", "futures", "getrandom", "hashbrown", @@ -3020,20 +2729,13 @@ dependencies = [ "polars-error", "polars-schema", "polars-utils", - "proptest", - "rand", "regex", - "regex-syntax 0.8.4", + "regex-syntax", "ryu", - "sample-arrow2", - "sample-std", - "sample-test", "serde", "simdutf8", "streaming-iterator", "strength_reduce", - "tokio", - "tokio-util", "version_check", "zstd", ] @@ -3060,7 +2762,6 @@ dependencies = [ "polars-arrow", "polars-error", "polars-utils", - "rand", "strength_reduce", "version_check", ] @@ -3071,7 +2772,6 @@ version = "0.43.1" dependencies = [ "ahash", "arrow-array", - "bincode", "bitflags", "bytemuck", "chrono", @@ -3194,7 +2894,6 @@ dependencies = [ "serde_json", "simd-json", "simdutf8", - "tempfile", "tokio", "tokio-util", "url", @@ -3244,7 +2943,6 @@ dependencies = [ "polars-utils", "pyo3", "rayon", - "serde_json", "tokio", "version_check", ] @@ -3327,7 +3025,6 @@ dependencies = [ "polars-compute", "polars-error", "polars-utils", - "rand", "serde", "simdutf8", "snap", @@ -3534,7 +3231,6 @@ dependencies = [ "num-traits", "once_cell", "polars-error", - "rand", "raw-cpuid", "rayon", "serde", @@ -3573,22 +3269,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "proptest" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" -dependencies = [ - "bitflags", - "lazy_static", - "num-traits", - "rand", - "rand_chacha", - "rand_xorshift", - "regex-syntax 0.8.4", - "unarray", -] - [[package]] name = "prost" version = "0.11.9" @@ -3699,12 +3379,6 @@ dependencies = [ "syn 2.0.76", ] -[[package]] -name = "quad-rand" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658fa1faf7a4cc5f057c9ee5ef560f717ad9d8dc66d975267f709624d6e1ab88" - [[package]] name = "quick-xml" version = "0.36.1" @@ -3715,17 +3389,6 @@ dependencies = [ "serde", ] -[[package]] -name = "quickcheck" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" -dependencies = [ - "env_logger", - "log", - "rand", -] - [[package]] name = "quinn" version = "0.11.3" @@ -3823,25 +3486,6 @@ dependencies = [ "rand", ] -[[package]] -name = "rand_regex" -version = "0.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b2a9fe2d7d9eeaf3279d1780452a5bbd26b31b27938787ef1c3e930d1e9cfbd" -dependencies = [ - "rand", - "regex-syntax 0.6.29", -] - -[[package]] -name = "rand_xorshift" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" -dependencies = [ - "rand_core", -] - [[package]] name = "raw-cpuid" version = "11.1.0" @@ -3935,7 +3579,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 0.8.4", + "regex-syntax", ] [[package]] @@ -3946,7 +3590,7 @@ checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.4", + "regex-syntax", ] [[package]] @@ -3955,12 +3599,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" version = "0.8.4" @@ -4203,52 +3841,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "sample-arrow2" -version = "0.17.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "502b30097ae5cc57ee8359bb59d8af349db022492de04596119d83f561ab8977" -dependencies = [ - "arrow2", - "sample-std", -] - -[[package]] -name = "sample-std" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "948bd219c6eb2b2ca1e004d8aefa8bbcf12614f60e0139b1758b49f9a94358c8" -dependencies = [ - "casey", - "quickcheck", - "rand", - "rand_regex", - "regex", -] - -[[package]] -name = "sample-test" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8b253ca516416756b09b582e2b7275de8f51f35e5d5711e20712b9377c7d5bf" -dependencies = [ - "quickcheck", - "sample-std", - "sample-test-macros", -] - -[[package]] -name = "sample-test-macros" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc6439a7589bb4581fdadb6391700ce4d26f8bffd34e2a75acb320822e9b5ef" -dependencies = [ - "proc-macro2", - "quote", - "sample-std", - "syn 1.0.109", -] - [[package]] name = "schannel" version = "0.1.23" @@ -4329,15 +3921,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" -dependencies = [ - "serde", -] - [[package]] name = "serde_derive" version = "1.0.209" @@ -4668,19 +4251,6 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" -[[package]] -name = "tempfile" -version = "3.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" -dependencies = [ - "cfg-if", - "fastrand", - "once_cell", - "rustix", - "windows-sys 0.59.0", -] - [[package]] name = "thiserror" version = "1.0.63" @@ -4740,16 +4310,6 @@ dependencies = [ "crunchy", ] -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - [[package]] name = "tinyvec" version = "1.8.0" @@ -4822,7 +4382,6 @@ checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", - "futures-io", "futures-sink", "pin-project-lite", "tokio", @@ -4936,38 +4495,12 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "typed-builder" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600" -dependencies = [ - "typed-builder-macro", -] - -[[package]] -name = "typed-builder-macro" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.76", -] - [[package]] name = "typenum" version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" -[[package]] -name = "unarray" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" - [[package]] name = "unicode-bidi" version = "0.3.15" @@ 
-5046,7 +4579,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", - "serde", ] [[package]] diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index 5e7e1eebff0a..4771ca468fa0 100644 --- a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -76,23 +76,6 @@ arrow-buffer = { workspace = true, optional = true } arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } -[dev-dependencies] -criterion = "0.5" -crossbeam-channel = { workspace = true } -doc-comment = "0.3" -flate2 = { workspace = true, default-features = true } -# used to run formal property testing -proptest = { version = "1", default-features = false, features = ["std"] } -# use for flaky testing -rand = { workspace = true } -# use for generating and testing random data samples -sample-arrow2 = "0.17" -sample-std = "0.2" -sample-test = "0.2" -# used to test async readers -tokio = { workspace = true, features = ["macros", "rt", "fs", "io-util"] } -tokio-util = { workspace = true, features = ["compat"] } - [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-compute/Cargo.toml b/crates/polars-compute/Cargo.toml index 4ade7134ec5e..04a4ea80a99b 100644 --- a/crates/polars-compute/Cargo.toml +++ b/crates/polars-compute/Cargo.toml @@ -17,9 +17,6 @@ polars-error = { workspace = true } polars-utils = { workspace = true } strength_reduce = { workspace = true } -[dev-dependencies] -rand = { workspace = true } - [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index a3f477e84dd6..05c51d5d2438 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -39,10 +39,6 @@ serde_json = { workspace = true, optional = true } thiserror = { workspace = true } xxhash-rust = { 
workspace = true } -[dev-dependencies] -bincode = { version = "1" } -serde_json = { workspace = true } - [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index e74ed8c9217d..0c0f54e4f920 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -588,6 +588,7 @@ impl Column { unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) } + #[cfg(feature = "zip_with")] pub fn zip_with_same_type( &self, mask: &ChunkedArray, @@ -712,6 +713,7 @@ impl Column { self.as_materialized_series().is_null() } + #[cfg(feature = "zip_with")] pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { // @scalar-opt self.as_materialized_series() diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index ca3d313e08ae..224786e9b02c 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -54,9 +54,6 @@ zstd = { workspace = true, optional = true } fs4 = { version = "0.9", features = ["sync"], optional = true } home = "0.5.4" -[dev-dependencies] -tempfile = "3" - [features] default = ["decompress"] # support for arrows json parsing diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 333fdc1211d2..67b0018b44e5 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -31,9 +31,6 @@ pyo3 = { workspace = true, optional = true } rayon = { workspace = true } tokio = { workspace = true, optional = true } -[dev-dependencies] -serde_json = { workspace = true } - [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 2f37857c9cd2..48f8ae9b50a2 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -43,9 +43,6 @@ package = "jsonpath_lib_polars_vendor" optional = true version = "0.0.1" -[dev-dependencies] -rand = { 
workspace = true, features = ["small_rng"] } - [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml index 26a57b22e713..8f41a5d35ad1 100644 --- a/crates/polars-parquet/Cargo.toml +++ b/crates/polars-parquet/Cargo.toml @@ -42,9 +42,6 @@ zstd = { version = "^0.13", optional = true, default-features = false } xxhash-rust = { version = "0.8", optional = true, features = ["xxh64"] } -[dev-dependencies] -rand = "0.8" - [features] compression = [ "zstd", diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 0f29723eee4e..a5049b84eb1b 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -20,6 +20,7 @@ mod concat; mod correlation; #[cfg(feature = "cum_agg")] mod cum; +#[cfg(feature = "cutqcut")] mod cut; #[cfg(feature = "temporal")] mod datetime; diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml index 9db54d1c3333..e334ff8d794a 100644 --- a/crates/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -24,11 +24,6 @@ rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlparser = { workspace = true } -# sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs.git", rev = "ae3b5844c839072c235965fe0d1bddc473dced87" } - -[dev-dependencies] -# to display dataframes in case of test failures -polars-core = { workspace = true, features = ["fmt"] } [features] default = [] diff --git a/crates/polars-time/Cargo.toml b/crates/polars-time/Cargo.toml index d75d634d213d..0d405241bd66 100644 --- a/crates/polars-time/Cargo.toml +++ b/crates/polars-time/Cargo.toml @@ -24,9 +24,6 @@ once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true, optional = true } -[dev-dependencies] -polars-ops = { workspace = true, features = ["abs"] } - [features] dtype-date = 
["polars-core/dtype-date", "temporal"] dtype-datetime = ["polars-core/dtype-datetime", "temporal"] diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml index 442d319b7753..e9071b2c9dc6 100644 --- a/crates/polars-utils/Cargo.toml +++ b/crates/polars-utils/Cargo.toml @@ -27,9 +27,6 @@ serde = { workspace = true, optional = true } stacker = { workspace = true } sysinfo = { version = "0.31", default-features = false, features = ["system"], optional = true } -[dev-dependencies] -rand = { workspace = true } - [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index b858dbc36678..05a1c8673f70 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -23,22 +23,6 @@ polars-sql = { workspace = true, optional = true } polars-time = { workspace = true, optional = true } polars-utils = { workspace = true } -[dev-dependencies] -ahash = { workspace = true } -apache-avro = { version = "0.17", features = ["snappy"] } -arrow = { workspace = true, features = ["arrow_rs"] } -arrow-buffer = { workspace = true } -avro-schema = { workspace = true, features = ["async"] } -either = { workspace = true } -ethnum = "1" -futures = { workspace = true } -# used to run formal property testing -proptest = { version = "1", default-features = false, features = ["std"] } -rand = { workspace = true } -# used to test async readers -tokio = { workspace = true, features = ["macros", "rt", "fs", "io-util"] } -tokio-util = { workspace = true, features = ["compat"] } - [build-dependencies] version_check = { workspace = true } From 0ecd61da41cd0d6ea42803adc92282781dc38b3b Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:38:09 +0200 Subject: [PATCH 19/42] add polars_utils dev dependency --- Cargo.lock | 1 + crates/polars-utils/Cargo.toml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ac1d699be13d..3bf1162f5129 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -3231,6 +3231,7 @@ dependencies = [ "num-traits", "once_cell", "polars-error", + "rand", "raw-cpuid", "rayon", "serde", diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml index e9071b2c9dc6..62c96dc5e4fc 100644 --- a/crates/polars-utils/Cargo.toml +++ b/crates/polars-utils/Cargo.toml @@ -30,6 +30,9 @@ sysinfo = { version = "0.31", default-features = false, features = ["system"], o [build-dependencies] version_check = { workspace = true } +[dev-dependencies] +rand = { workspace = true } + [features] mmap = ["memmap"] bigidx = [] From b86c33b825dba4fbce175372971b68e73a8ac0c9 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:40:36 +0200 Subject: [PATCH 20/42] idk --- Cargo.lock | 479 ++++++++++++++++++++++++++++++- crates/polars-arrow/Cargo.toml | 17 ++ crates/polars-compute/Cargo.toml | 3 + crates/polars-core/Cargo.toml | 4 + crates/polars-io/Cargo.toml | 3 + crates/polars-lazy/Cargo.toml | 3 + crates/polars-ops/Cargo.toml | 3 + crates/polars-parquet/Cargo.toml | 3 + crates/polars-sql/Cargo.toml | 5 + crates/polars-time/Cargo.toml | 3 + crates/polars-utils/Cargo.toml | 6 +- crates/polars/Cargo.toml | 16 ++ 12 files changed, 536 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3bf1162f5129..94a0a87e3d8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,12 +88,50 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" + [[package]] name = "anyhow" version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +[[package]] +name = 
"apache-avro" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13" +dependencies = [ + "bigdecimal", + "crc32fast", + "digest", + "libflate 2.1.0", + "log", + "num-bigint", + "quad-rand", + "rand", + "regex-lite", + "serde", + "serde_bytes", + "serde_json", + "snap", + "strum", + "strum_macros", + "thiserror", + "typed-builder", + "uuid", +] + [[package]] name = "arboard" version = "3.4.0" @@ -181,6 +219,26 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +[[package]] +name = "arrow2" +version = "0.17.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +dependencies = [ + "ahash", + "bytemuck", + "chrono", + "dyn-clone", + "either", + "ethnum", + "foreign_vec", + "getrandom", + "hash_hasher", + "num-traits", + "rustc_version", + "simdutf8", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -251,7 +309,7 @@ dependencies = [ "crc", "fallible-streaming-iterator", "futures", - "libflate", + "libflate 1.4.0", "serde", "serde_json", "snap", @@ -680,6 +738,29 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bigdecimal" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d712318a27c7150326677b321a5fa91b55f6d9034ffd67f20319e147d40cee" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", + "serde", +] + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + 
[[package]] name = "bitflags" version = "2.6.0" @@ -812,6 +893,21 @@ dependencies = [ "url", ] +[[package]] +name = "casey" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614586263949597dcc18675da12ef9b429135e13628d92eb8b8c6fa50ca5656b" +dependencies = [ + "syn 1.0.109", +] + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "castaway" version = "0.2.3" @@ -900,6 +996,31 @@ dependencies = [ "half", ] +[[package]] +name = "clap" +version = "4.5.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + [[package]] name = "clipboard-win" version = "5.4.0" @@ -993,6 +1114,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.13" @@ -1035,6 +1165,42 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -1138,6 +1304,12 @@ dependencies = [ "typenum", ] +[[package]] +name = "dary_heap" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" + [[package]] name = "der" version = "0.6.1" @@ -1233,6 +1405,16 @@ dependencies = [ "syn 2.0.76", ] +[[package]] +name = "env_logger" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" +dependencies = [ + "log", + "regex", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1315,6 +1497,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1551,6 +1739,12 @@ dependencies = [ "serde", ] +[[package]] +name = "hash_hasher" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" + 
[[package]] name = "hashbrown" version = "0.14.5" @@ -1581,6 +1775,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hex" version = "0.4.3" @@ -1839,6 +2039,17 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +[[package]] +name = "is-terminal" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" +dependencies = [ + "hermit-abi 0.4.0", + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "itertools" version = "0.10.5" @@ -1918,6 +2129,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "lexical-core" version = "0.8.5" @@ -1996,7 +2213,20 @@ checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" dependencies = [ "adler32", "crc32fast", - "libflate_lz77", + "libflate_lz77 1.2.0", +] + +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77 2.1.0", ] [[package]] @@ -2008,6 +2238,17 @@ dependencies = [ "rle-decode-fast", ] +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown", + "rle-decode-fast", +] + [[package]] name = "libgit2-sys" version = "0.17.0+1.8.1" @@ -2211,7 +2452,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", "wasi", "windows-sys 0.52.0", @@ -2292,6 +2533,7 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", + "serde", ] [[package]] @@ -2509,6 +2751,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + [[package]] name = "openssl-probe" version = "0.1.5" @@ -2675,10 +2923,45 @@ dependencies = [ "array-init-cursor", ] +[[package]] +name = "plotters" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" + +[[package]] +name = "plotters-svg" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" +dependencies = [ + "plotters-backend", +] + [[package]] name = "polars" version = "0.43.1" dependencies = [ + "ahash", + "apache-avro", + 
"arrow-buffer", + "avro-schema", + "either", + "ethnum", + "futures", "getrandom", "polars-arrow", "polars-core", @@ -2691,6 +2974,10 @@ dependencies = [ "polars-sql", "polars-time", "polars-utils", + "proptest", + "rand", + "tokio", + "tokio-util", "version_check", ] @@ -2710,10 +2997,14 @@ dependencies = [ "bytemuck", "chrono", "chrono-tz", + "criterion", + "crossbeam-channel", + "doc-comment", "dyn-clone", "either", "ethnum", "fast-float", + "flate2", "futures", "getrandom", "hashbrown", @@ -2729,13 +3020,20 @@ dependencies = [ "polars-error", "polars-schema", "polars-utils", + "proptest", + "rand", "regex", - "regex-syntax", + "regex-syntax 0.8.4", "ryu", + "sample-arrow2", + "sample-std", + "sample-test", "serde", "simdutf8", "streaming-iterator", "strength_reduce", + "tokio", + "tokio-util", "version_check", "zstd", ] @@ -2762,6 +3060,7 @@ dependencies = [ "polars-arrow", "polars-error", "polars-utils", + "rand", "strength_reduce", "version_check", ] @@ -2772,6 +3071,7 @@ version = "0.43.1" dependencies = [ "ahash", "arrow-array", + "bincode", "bitflags", "bytemuck", "chrono", @@ -2894,6 +3194,7 @@ dependencies = [ "serde_json", "simd-json", "simdutf8", + "tempfile", "tokio", "tokio-util", "url", @@ -2943,6 +3244,7 @@ dependencies = [ "polars-utils", "pyo3", "rayon", + "serde_json", "tokio", "version_check", ] @@ -3025,6 +3327,7 @@ dependencies = [ "polars-compute", "polars-error", "polars-utils", + "rand", "serde", "simdutf8", "snap", @@ -3270,6 +3573,22 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" +dependencies = [ + "bitflags", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.8.4", + "unarray", +] + [[package]] name = "prost" version = "0.11.9" @@ -3380,6 +3699,12 @@ dependencies = [ "syn 2.0.76", ] +[[package]] +name = 
"quad-rand" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "658fa1faf7a4cc5f057c9ee5ef560f717ad9d8dc66d975267f709624d6e1ab88" + [[package]] name = "quick-xml" version = "0.36.1" @@ -3390,6 +3715,17 @@ dependencies = [ "serde", ] +[[package]] +name = "quickcheck" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" +dependencies = [ + "env_logger", + "log", + "rand", +] + [[package]] name = "quinn" version = "0.11.3" @@ -3487,6 +3823,25 @@ dependencies = [ "rand", ] +[[package]] +name = "rand_regex" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a9fe2d7d9eeaf3279d1780452a5bbd26b31b27938787ef1c3e930d1e9cfbd" +dependencies = [ + "rand", + "regex-syntax 0.6.29", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + [[package]] name = "raw-cpuid" version = "11.1.0" @@ -3580,7 +3935,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.4", ] [[package]] @@ -3591,7 +3946,7 @@ checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.4", ] [[package]] @@ -3600,6 +3955,12 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.4" @@ -3842,6 +4203,52 @@ dependencies = [ 
"winapi-util", ] +[[package]] +name = "sample-arrow2" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "502b30097ae5cc57ee8359bb59d8af349db022492de04596119d83f561ab8977" +dependencies = [ + "arrow2", + "sample-std", +] + +[[package]] +name = "sample-std" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "948bd219c6eb2b2ca1e004d8aefa8bbcf12614f60e0139b1758b49f9a94358c8" +dependencies = [ + "casey", + "quickcheck", + "rand", + "rand_regex", + "regex", +] + +[[package]] +name = "sample-test" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8b253ca516416756b09b582e2b7275de8f51f35e5d5711e20712b9377c7d5bf" +dependencies = [ + "quickcheck", + "sample-std", + "sample-test-macros", +] + +[[package]] +name = "sample-test-macros" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc6439a7589bb4581fdadb6391700ce4d26f8bffd34e2a75acb320822e9b5ef" +dependencies = [ + "proc-macro2", + "quote", + "sample-std", + "syn 1.0.109", +] + [[package]] name = "schannel" version = "0.1.23" @@ -3922,6 +4329,15 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_bytes" +version = "0.11.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" +dependencies = [ + "serde", +] + [[package]] name = "serde_derive" version = "1.0.209" @@ -4252,6 +4668,19 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +[[package]] +name = "tempfile" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +dependencies = [ + "cfg-if", + "fastrand", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] 
+ [[package]] name = "thiserror" version = "1.0.63" @@ -4311,6 +4740,16 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.8.0" @@ -4383,6 +4822,7 @@ checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", "pin-project-lite", "tokio", @@ -4496,12 +4936,38 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-builder" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600" +dependencies = [ + "typed-builder-macro", +] + +[[package]] +name = "typed-builder-macro" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.76", +] + [[package]] name = "typenum" version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-bidi" version = "0.3.15" @@ -4580,6 +5046,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", + "serde", ] [[package]] diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index 4771ca468fa0..5e7e1eebff0a 100644 --- 
a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -76,6 +76,23 @@ arrow-buffer = { workspace = true, optional = true } arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } +[dev-dependencies] +criterion = "0.5" +crossbeam-channel = { workspace = true } +doc-comment = "0.3" +flate2 = { workspace = true, default-features = true } +# used to run formal property testing +proptest = { version = "1", default-features = false, features = ["std"] } +# use for flaky testing +rand = { workspace = true } +# use for generating and testing random data samples +sample-arrow2 = "0.17" +sample-std = "0.2" +sample-test = "0.2" +# used to test async readers +tokio = { workspace = true, features = ["macros", "rt", "fs", "io-util"] } +tokio-util = { workspace = true, features = ["compat"] } + [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-compute/Cargo.toml b/crates/polars-compute/Cargo.toml index 04a4ea80a99b..4ade7134ec5e 100644 --- a/crates/polars-compute/Cargo.toml +++ b/crates/polars-compute/Cargo.toml @@ -17,6 +17,9 @@ polars-error = { workspace = true } polars-utils = { workspace = true } strength_reduce = { workspace = true } +[dev-dependencies] +rand = { workspace = true } + [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index 05c51d5d2438..a3f477e84dd6 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -39,6 +39,10 @@ serde_json = { workspace = true, optional = true } thiserror = { workspace = true } xxhash-rust = { workspace = true } +[dev-dependencies] +bincode = { version = "1" } +serde_json = { workspace = true } + [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 224786e9b02c..ca3d313e08ae 100644 --- a/crates/polars-io/Cargo.toml +++ 
b/crates/polars-io/Cargo.toml @@ -54,6 +54,9 @@ zstd = { workspace = true, optional = true } fs4 = { version = "0.9", features = ["sync"], optional = true } home = "0.5.4" +[dev-dependencies] +tempfile = "3" + [features] default = ["decompress"] # support for arrows json parsing diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 67b0018b44e5..333fdc1211d2 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -31,6 +31,9 @@ pyo3 = { workspace = true, optional = true } rayon = { workspace = true } tokio = { workspace = true, optional = true } +[dev-dependencies] +serde_json = { workspace = true } + [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 48f8ae9b50a2..2f37857c9cd2 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -43,6 +43,9 @@ package = "jsonpath_lib_polars_vendor" optional = true version = "0.0.1" +[dev-dependencies] +rand = { workspace = true, features = ["small_rng"] } + [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml index 8f41a5d35ad1..26a57b22e713 100644 --- a/crates/polars-parquet/Cargo.toml +++ b/crates/polars-parquet/Cargo.toml @@ -42,6 +42,9 @@ zstd = { version = "^0.13", optional = true, default-features = false } xxhash-rust = { version = "0.8", optional = true, features = ["xxh64"] } +[dev-dependencies] +rand = "0.8" + [features] compression = [ "zstd", diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml index e334ff8d794a..9db54d1c3333 100644 --- a/crates/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -24,6 +24,11 @@ rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlparser = { workspace = true } +# sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs.git", rev = 
"ae3b5844c839072c235965fe0d1bddc473dced87" } + +[dev-dependencies] +# to display dataframes in case of test failures +polars-core = { workspace = true, features = ["fmt"] } [features] default = [] diff --git a/crates/polars-time/Cargo.toml b/crates/polars-time/Cargo.toml index 0d405241bd66..d75d634d213d 100644 --- a/crates/polars-time/Cargo.toml +++ b/crates/polars-time/Cargo.toml @@ -24,6 +24,9 @@ once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true, optional = true } +[dev-dependencies] +polars-ops = { workspace = true, features = ["abs"] } + [features] dtype-date = ["polars-core/dtype-date", "temporal"] dtype-datetime = ["polars-core/dtype-datetime", "temporal"] diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml index 62c96dc5e4fc..442d319b7753 100644 --- a/crates/polars-utils/Cargo.toml +++ b/crates/polars-utils/Cargo.toml @@ -27,12 +27,12 @@ serde = { workspace = true, optional = true } stacker = { workspace = true } sysinfo = { version = "0.31", default-features = false, features = ["system"], optional = true } -[build-dependencies] -version_check = { workspace = true } - [dev-dependencies] rand = { workspace = true } +[build-dependencies] +version_check = { workspace = true } + [features] mmap = ["memmap"] bigidx = [] diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 05a1c8673f70..b858dbc36678 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -23,6 +23,22 @@ polars-sql = { workspace = true, optional = true } polars-time = { workspace = true, optional = true } polars-utils = { workspace = true } +[dev-dependencies] +ahash = { workspace = true } +apache-avro = { version = "0.17", features = ["snappy"] } +arrow = { workspace = true, features = ["arrow_rs"] } +arrow-buffer = { workspace = true } +avro-schema = { workspace = true, features = ["async"] } +either = { workspace = true } +ethnum = "1" +futures = { workspace = true } +# used to run formal property 
testing +proptest = { version = "1", default-features = false, features = ["std"] } +rand = { workspace = true } +# used to test async readers +tokio = { workspace = true, features = ["macros", "rt", "fs", "io-util"] } +tokio-util = { workspace = true, features = ["compat"] } + [build-dependencies] version_check = { workspace = true } From 5afc13edcd8d58c29f983d68a9484d483bce7907 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 17:45:39 +0200 Subject: [PATCH 21/42] fix without debug assertions --- crates/polars-core/src/frame/mod.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 0edbfa7a726e..8220c9a6ddfb 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1236,13 +1236,10 @@ impl DataFrame { /// # Safety /// The caller must ensure `column.len() == self.height()` . pub unsafe fn with_column_unchecked(&mut self, column: Series) -> &mut Self { - #[cfg(debug_assertions)] - { - return self.with_column(column).unwrap(); - } - #[cfg(not(debug_assertions))] - { - self.get_columns_mut().push(column); + if cfg!(debug_assertions) { + self.with_column(column).unwrap() + } else { + self.get_columns_mut().push(column.into_column()); self } } From 7fb53df92f77c7fed3b967cc31b0e6eaca153c33 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 18:11:52 +0200 Subject: [PATCH 22/42] fix miri in CI --- crates/polars-arrow/src/bitmap/bitmask.rs | 4 +- .../polars-core/src/chunked_array/ops/mod.rs | 1 + crates/polars-core/src/frame/column.rs | 52 +++++++++++++ crates/polars-core/src/frame/mod.rs | 74 +++++++++---------- 4 files changed, 92 insertions(+), 39 deletions(-) diff --git a/crates/polars-arrow/src/bitmap/bitmask.rs b/crates/polars-arrow/src/bitmap/bitmask.rs index 67785a49eeda..4d6457c07956 100644 --- a/crates/polars-arrow/src/bitmap/bitmask.rs +++ b/crates/polars-arrow/src/bitmap/bitmask.rs @@ 
-14,7 +14,7 @@ fn nth_set_bit_u32(w: u32, n: u32) -> Option { // We use this by setting the first argument to 1 << n, which means the // first n-1 zero bits of it will spread to the first n-1 one bits of w, // after which the one bit will exactly get copied to the nth one bit of w. - #[cfg(target_feature = "bmi2")] + #[cfg(all(not(miri), target_feature = "bmi2"))] { if n >= 32 { return None; @@ -28,7 +28,7 @@ fn nth_set_bit_u32(w: u32, n: u32) -> Option { Some(nth_set_bit.trailing_zeros()) } - #[cfg(not(target_feature = "bmi2"))] + #[cfg(any(miri, not(target_feature = "bmi2")))] { // Each block of 2/4/8/16 bits contains how many set bits there are in that block. let set_per_2 = w - ((w >> 1) & 0x55555555); diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 456ac561a3f1..8da567d06491 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -306,6 +306,7 @@ pub trait ChunkVar { /// fn filter_all_ones(df: &DataFrame) -> PolarsResult { /// let mask = df /// .column("column_a")? +/// .as_materialized_series() /// .equal(1)?; /// /// df.filter(&mask) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 0c0f54e4f920..2135d35704ed 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -801,6 +801,58 @@ impl Column { } } +impl ChunkCompare<&Column> for Column { + type Item = PolarsResult; + + /// Create a boolean mask by checking for equality. + #[inline] + fn equal(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().equal(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking for equality. + #[inline] + fn equal_missing(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().equal_missing(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking for inequality. 
+ #[inline] + fn not_equal(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().not_equal(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking for inequality. + #[inline] + fn not_equal_missing(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().not_equal_missing(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self > rhs. + #[inline] + fn gt(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().gt(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self >= rhs. + #[inline] + fn gt_eq(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().gt_eq(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self < rhs. + #[inline] + fn lt(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().lt(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self <= rhs. + #[inline] + fn lt_eq(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series().lt_eq(rhs.as_materialized_series()) + } +} + impl Default for Column { fn default() -> Self { // @scalar-opt diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 8220c9a6ddfb..c058068a3aab 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -121,8 +121,8 @@ where /// /// ```rust /// # use polars_core::prelude::*; -/// let s1 = Series::new("Fruit".into(), ["Apple", "Apple", "Pear"]); -/// let s2 = Series::new("Color".into(), ["Red", "Yellow", "Green"]); +/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]); +/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]); /// /// let df: PolarsResult = DataFrame::new(vec![s1, s2]); /// ``` @@ -151,8 +151,8 @@ where /// let df = df!("Fruit" => ["Apple", "Apple", "Pear"], /// "Color" => ["Red", "Yellow", "Green"])?; /// -/// assert_eq!(df[0], 
Series::new("Fruit".into(), &["Apple", "Apple", "Pear"])); -/// assert_eq!(df[1], Series::new("Color".into(), &["Red", "Yellow", "Green"])); +/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"])); +/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"])); /// # Ok::<(), PolarsError>(()) /// ``` /// @@ -163,8 +163,8 @@ where /// let df = df!("Fruit" => ["Apple", "Apple", "Pear"], /// "Color" => ["Red", "Yellow", "Green"])?; /// -/// assert_eq!(df["Fruit"], Series::new("Fruit".into(), &["Apple", "Apple", "Pear"])); -/// assert_eq!(df["Color"], Series::new("Color".into(), &["Red", "Yellow", "Green"])); +/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"])); +/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"])); /// # Ok::<(), PolarsError>(()) /// ``` #[derive(Clone)] @@ -276,8 +276,8 @@ impl DataFrame { /// /// ``` /// # use polars_core::prelude::*; - /// let s0 = Series::new("days".into(), [0, 1, 2].as_ref()); - /// let s1 = Series::new("temp".into(), [22.1, 19.9, 7.].as_ref()); + /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref()); + /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref()); /// /// let df = DataFrame::new(vec![s0, s1])?; /// # Ok::<(), PolarsError>(()) @@ -377,8 +377,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s1 = Series::new("Ocean".into(), ["Atlantic", "Indian"]); - /// let s2 = Series::new("Area (km²)".into(), [106_460_000, 70_560_000]); + /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]); + /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]); /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?; /// /// assert_eq!(df.pop(), Some(s2)); @@ -588,7 +588,7 @@ impl DataFrame { /// # use polars_core::prelude::*; /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"], /// "Symbol" => ["A", "C", "G", "T"])?; - 
/// let columns: &[Series] = df.get_columns(); + /// let columns: &[Column] = df.get_columns(); /// /// assert_eq!(columns[0].name(), "Name"); /// assert_eq!(columns[1].name(), "Symbol"); @@ -619,14 +619,14 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s1: Series = Series::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]); - /// let s2: Series = Series::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]); + /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]); + /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]); /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?; /// /// let mut iterator = df.iter(); /// - /// assert_eq!(iterator.next(), Some(&s1)); - /// assert_eq!(iterator.next(), Some(&s2)); + /// assert_eq!(iterator.next(), Some(s1.as_materialized_series())); + /// assert_eq!(iterator.next(), Some(s2.as_materialized_series())); /// assert_eq!(iterator.next(), None); /// # Ok::<(), PolarsError>(()) /// ``` @@ -837,8 +837,8 @@ impl DataFrame { /// ```rust /// # use polars_core::prelude::*; /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?; - /// let s1: Series = Series::new("Proton".into(), [29, 47, 79]); - /// let s2: Series = Series::new("Electron".into(), [29, 47, 79]); + /// let s1 = Column::new("Proton".into(), [29, 47, 79]); + /// let s2 = Column::new("Electron".into(), [29, 47, 79]); /// /// let df2: DataFrame = df1.hstack(&[s1, s2])?; /// assert_eq!(df2.shape(), (3, 3)); @@ -1047,7 +1047,7 @@ impl DataFrame { /// assert!(s1.is_err()); /// /// let s2: Column = df.drop_in_place("Animal")?; - /// assert_eq!(s2, Column::new_series("Animal".into(), &["Tiger", "Lion", "Great auk"])); + /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"])); /// # Ok::<(), PolarsError>(()) /// ``` pub fn drop_in_place(&mut self, name: &str) -> PolarsResult { @@ -1355,8 +1355,8 @@ impl DataFrame { 
/// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"], /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?; /// - /// let s1: Option<&Series> = df.select_at_idx(0); - /// let s2: Series = Series::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]); + /// let s1: Option<&Column> = df.select_at_idx(0); + /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]); /// /// assert_eq!(s1, Some(&s2)); /// # Ok::<(), PolarsError>(()) @@ -1468,8 +1468,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s1: Series = Series::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]); - /// let s2: Series = Series::new("Robustness".into(), ["Weak", "Strong"]); + /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]); + /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]); /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?; /// /// assert_eq!(df.column("Password")?, &s1); @@ -1488,7 +1488,7 @@ impl DataFrame { /// # use polars_core::prelude::*; /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"], /// "Max weight (kg)" => [16.0, 35.89])?; - /// let sv: Vec<&Series> = df.columns(["Latin name", "Max weight (kg)"])?; + /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?; /// /// assert_eq!(&df[0], sv[0]); /// assert_eq!(&df[1], sv[1]); @@ -1609,7 +1609,7 @@ impl DataFrame { /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"], /// "Carbon" => [1, 2, 3], /// "Hydrogen" => [4, 6, 8])?; - /// let sv: Vec = df.select_series(["Carbon", "Hydrogen"])?; + /// let sv: Vec = df.select_columns(["Carbon", "Hydrogen"])?; /// /// assert_eq!(df["Carbon"], sv[0]); /// assert_eq!(df["Hydrogen"], sv[1]); @@ -2020,8 +2020,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]); - /// let 
s1 = Series::new("names".into(), ["Jean", "Claude", "van"]); + /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]); + /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// fn str_to_len(str_val: &Series) -> Series { @@ -2070,8 +2070,8 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]); - /// let s1 = Series::new("ascii".into(), [70, 79, 79]); + /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]); + /// let s1 = Column::new("ascii".into(), [70, 79, 79]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// // Add 32 to get lowercase ascii values @@ -2140,14 +2140,14 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]); - /// let s1 = Series::new("values".into(), [1, 2, 3, 4, 5]); + /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]); + /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// let idx = vec![0, 1, 4]; /// - /// df.try_apply("foo", |s| { - /// s.str()? + /// df.try_apply("foo", |c| { + /// c.str()? 
/// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string))) /// }); /// # Ok::<(), PolarsError>(()) @@ -2204,16 +2204,16 @@ impl DataFrame { /// /// ```rust /// # use polars_core::prelude::*; - /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]); - /// let s1 = Series::new("values".into(), [1, 2, 3, 4, 5]); + /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]); + /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// // create a mask - /// let values = df.column("values")?; + /// let values = df.column("values")?.as_materialized_series(); /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?; /// - /// df.try_apply("foo", |s| { - /// s.str()? + /// df.try_apply("foo", |c| { + /// c.str()? /// .set(&mask, Some("not_within_bounds")) /// }); /// # Ok::<(), PolarsError>(()) From 94d2a7cf2275f11bf3e7c3ce21824dc11657fab4 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 18:12:20 +0200 Subject: [PATCH 23/42] fmt --- crates/polars-core/src/frame/column.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 2135d35704ed..59d187d59afd 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -807,49 +807,57 @@ impl ChunkCompare<&Column> for Column { /// Create a boolean mask by checking for equality. #[inline] fn equal(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().equal(rhs.as_materialized_series()) + self.as_materialized_series() + .equal(rhs.as_materialized_series()) } /// Create a boolean mask by checking for equality. 
#[inline] fn equal_missing(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().equal_missing(rhs.as_materialized_series()) + self.as_materialized_series() + .equal_missing(rhs.as_materialized_series()) } /// Create a boolean mask by checking for inequality. #[inline] fn not_equal(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().not_equal(rhs.as_materialized_series()) + self.as_materialized_series() + .not_equal(rhs.as_materialized_series()) } /// Create a boolean mask by checking for inequality. #[inline] fn not_equal_missing(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().not_equal_missing(rhs.as_materialized_series()) + self.as_materialized_series() + .not_equal_missing(rhs.as_materialized_series()) } /// Create a boolean mask by checking if self > rhs. #[inline] fn gt(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().gt(rhs.as_materialized_series()) + self.as_materialized_series() + .gt(rhs.as_materialized_series()) } /// Create a boolean mask by checking if self >= rhs. #[inline] fn gt_eq(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().gt_eq(rhs.as_materialized_series()) + self.as_materialized_series() + .gt_eq(rhs.as_materialized_series()) } /// Create a boolean mask by checking if self < rhs. #[inline] fn lt(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().lt(rhs.as_materialized_series()) + self.as_materialized_series() + .lt(rhs.as_materialized_series()) } /// Create a boolean mask by checking if self <= rhs. 
#[inline] fn lt_eq(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series().lt_eq(rhs.as_materialized_series()) + self.as_materialized_series() + .lt_eq(rhs.as_materialized_series()) } } From e8c5088ecdb3ddf3a4027e6d4d884b04c7f1b2bc Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 11 Sep 2024 18:23:48 +0200 Subject: [PATCH 24/42] fix test --- crates/polars-core/src/chunked_array/ndarray.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ndarray.rs b/crates/polars-core/src/chunked_array/ndarray.rs index a3017f0103e6..94889445f845 100644 --- a/crates/polars-core/src/chunked_array/ndarray.rs +++ b/crates/polars-core/src/chunked_array/ndarray.rs @@ -83,8 +83,8 @@ impl DataFrame { /// /// ```rust /// use polars_core::prelude::*; - /// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_series(); - /// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_series(); + /// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_column(); + /// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_column(); /// /// let df = DataFrame::new(vec![a, b]).unwrap(); /// let ndarray = df.to_ndarray::(IndexOrder::Fortran).unwrap(); From 4ae63d59eee6542d5a6e028e923023c33ec8c68d Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 12 Sep 2024 10:11:23 +0200 Subject: [PATCH 25/42] fix many doc issues --- .../src/chunked_array/ops/fill_null.rs | 2 +- crates/polars-core/src/frame/column.rs | 51 ++++++++++++++++--- crates/polars-core/src/frame/horizontal.rs | 2 +- crates/polars-core/src/frame/mod.rs | 6 +-- crates/polars-core/src/scalar/mod.rs | 1 + crates/polars-io/src/ipc/ipc_file.rs | 4 +- crates/polars-io/src/ipc/ipc_stream.rs | 6 +-- crates/polars-lazy/src/lib.rs | 6 +-- crates/polars/src/docs/eager.rs | 45 ++++++++++------ crates/polars/src/docs/lazy.rs | 6 +-- 10 files changed, 90 insertions(+), 39 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs 
b/crates/polars-core/src/chunked_array/ops/fill_null.rs index 7aa348d5e440..377b51afe134 100644 --- a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -30,7 +30,7 @@ impl Series { /// ```rust /// # use polars_core::prelude::*; /// fn example() -> PolarsResult<()> { - /// let s = Series::new("some_missing".into(), &[Some(1), None, Some(2)]); + /// let s = Column::new("some_missing".into(), &[Some(1), None, Some(2)]); /// /// let filled = s.fill_null(FillNullStrategy::Forward(None))?; /// assert_eq!(Vec::from(filled.i32()?), &[Some(1), Some(1), Some(2)]); diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 59d187d59afd..2a07af755a66 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -10,6 +10,15 @@ use crate::chunked_array::metadata::MetadataFlags; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; +/// A column within a [`DataFrame`]. +/// +/// This is lazily initialized to a [`Series`] with methods like +/// [`as_materialized_series`][Column::as_materialized_series] and +/// [`take_materialized_series`][Column::take_materialized_series]. +/// +/// Currently, there are two ways to represent a [`Column`]. +/// 1. A [`Series`] of values +/// 2. 
A [`ScalarColumn`] that repeats a single [`Scalar`] #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[cfg_attr(feature = "serde", serde(from = "Series"))] @@ -19,14 +28,23 @@ pub enum Column { Scalar(ScalarColumn), } +/// A column #[derive(Debug, Clone)] pub struct ScalarColumn { name: PlSmallStr, value: Scalar, - materialized: OnceLock, length: usize, + + // invariants: + // materialized.name() == name + // materialized.len() == length + // materialized.dtype() == value.dtype + // materialized[i] == value, for all 0 <= i < length + /// A lazily materialized [`Series`] variant of this [`ScalarColumn`] + materialized: OnceLock, } +/// Convert `Self` into a [`Column`] pub trait IntoColumn: Sized { fn into_column(self) -> Column; } @@ -52,6 +70,10 @@ impl Column { Self::Scalar(ScalarColumn::new(name, value, length)) } + // # Materialize + /// Get a reference to a [`Series`] for this [`Column`] + /// + /// This may need to materialize the [`Series`] on the first invocation for a specific column. #[inline] pub fn as_materialized_series(&self) -> &Series { match self { @@ -59,9 +81,11 @@ impl Column { Column::Scalar(s) => s.as_materialized_series(), } } - + /// Turn [`Column`] into a [`Column::Series`]. + /// + /// This may need to materialize the [`Series`] on the first invocation for a specific column. #[inline] - pub fn as_materialized_series_mut(&mut self) -> &mut Series { + pub fn into_materialized_series(&mut self) -> &mut Series { match self { Column::Series(s) => s, Column::Scalar(s) => { @@ -73,7 +97,9 @@ impl Column { }, } } - + /// Take [`Series`] from a [`Column`] + /// + /// This may need to materialize the [`Series`] on the first invocation for a specific column. 
#[inline] pub fn take_materialized_series(self) -> Series { match self { @@ -101,6 +127,7 @@ impl Column { } } + // # Downcasting #[inline] pub fn as_series(&self) -> Option<&Series> { match self { @@ -108,7 +135,6 @@ impl Column { Column::Scalar(_) => None, } } - #[inline] pub fn as_scalar_column(&self) -> Option<&ScalarColumn> { match self { @@ -482,7 +508,7 @@ impl Column { pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { // @scalar-opt - self.as_materialized_series_mut() + self.into_materialized_series() .append(other.as_materialized_series())?; Ok(self) } @@ -537,7 +563,7 @@ impl Column { pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { // @scalar-opt - self.as_materialized_series_mut() + self.into_materialized_series() .extend(other.as_materialized_series())?; Ok(self) } @@ -1096,10 +1122,14 @@ impl ScalarColumn { Self::_to_series(self.name.clone(), self.value.clone(), self.length) } + /// Get the [`ScalarColumn`] as [`Series`] + /// + /// This needs to materialize upon the first call. Afterwards, this is cached. pub fn as_materialized_series(&self) -> &Series { self.materialized.get_or_init(|| self.to_series()) } + /// Take the [`ScalarColumn`] and materialize as a [`Series`] if not already done. pub fn take_materialized_series(self) -> Series { self.materialized .into_inner() @@ -1126,6 +1156,13 @@ impl IntoColumn for Column { } } +impl IntoColumn for ScalarColumn { + #[inline(always)] + fn into_column(self) -> Column { + self.into() + } +} + /// We don't want to serialize the scalar columns. So this helps pretend that columns are always /// initialized without implementing From for Series. 
/// diff --git a/crates/polars-core/src/frame/horizontal.rs b/crates/polars-core/src/frame/horizontal.rs index cf65b807f3e5..31c072991d87 100644 --- a/crates/polars-core/src/frame/horizontal.rs +++ b/crates/polars-core/src/frame/horizontal.rs @@ -86,7 +86,7 @@ pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> Polars let diff = max_len - df.height(); df.columns.iter_mut().for_each(|s| { // @scalar-opt - let s = s.as_materialized_series_mut(); + let s = s.into_materialized_series(); *s = s.extend_constant(AnyValue::Null, diff).unwrap() }); } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index c058068a3aab..2bd2ff576da5 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -971,7 +971,7 @@ impl DataFrame { .zip(other.columns.iter()) .try_for_each::<_, PolarsResult<_>>(|(left, right)| { // @scalar-opt - let left = left.as_materialized_series_mut(); + let left = left.into_materialized_series(); let right = right.as_materialized_series(); ensure_can_extend(&*left, right)?; @@ -993,7 +993,7 @@ impl DataFrame { .zip(other.columns.iter()) .for_each(|(left, right)| { // @scalar-opt - let left = left.as_materialized_series_mut(); + let left = left.into_materialized_series(); let right = right.as_materialized_series(); left.append(right).expect("should not fail"); @@ -1025,7 +1025,7 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .try_for_each::<_, PolarsResult<_>>(|(left, right)| { - let left = left.as_materialized_series_mut(); + let left = left.into_materialized_series(); let right = right.as_materialized_series(); ensure_can_extend(&*left, right)?; diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index d0948e8ff761..22eef608e8c7 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -45,6 +45,7 @@ impl Scalar { Series::from_any_values_and_dtype(name, &[self.as_any_value()], 
&self.dtype, true).unwrap() } + /// Turn a scalar into a column with `length=1`. pub fn into_column(self, name: PlSmallStr) -> Column { // @scalar-opt Series::from_any_values_and_dtype(name, &[self.as_any_value()], &self.dtype, true) diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index feaea44f5417..64598ca8c848 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -12,8 +12,8 @@ //! use std::io::Cursor; //! //! -//! let s0 = Series::new("days".into(), &[0, 1, 2, 3, 4]); -//! let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); +//! let s0 = Column::new("days".into(), &[0, 1, 2, 3, 4]); +//! let s1 = Column::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); //! let mut df = DataFrame::new(vec![s0, s1]).unwrap(); //! //! // Create an in memory file handler. diff --git a/crates/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs index 545f19168f9f..6b16579ac93d 100644 --- a/crates/polars-io/src/ipc/ipc_stream.rs +++ b/crates/polars-io/src/ipc/ipc_stream.rs @@ -13,9 +13,9 @@ //! use std::io::Cursor; //! //! -//! let s0 = Series::new("days".into(), &[0, 1, 2, 3, 4]); -//! let s1 = Series::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); -//! let mut df = DataFrame::new(vec![s0, s1]).unwrap(); +//! let c0 = Column::new("days".into(), &[0, 1, 2, 3, 4]); +//! let c1 = Column::new("temp".into(), &[22.1, 19.9, 7., 2., 3.]); +//! let mut df = DataFrame::new(vec![c0, c1]).unwrap(); //! //! // Create an in memory file handler. //! // Vec: Read + Write diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index 005a09186ba2..3059384a1c8c 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -61,7 +61,7 @@ //! assert!(new.column("new_column") //! .unwrap() //! .equals( -//! &Series::new("new_column".into(), &[50, 40, 30, 20, 10]) +//! &Column::new("new_column".into(), &[50, 40, 30, 20, 10]) //! ) //! ); //! 
``` @@ -94,7 +94,7 @@ //! assert!(new.column("new_column") //! .unwrap() //! .equals( -//! &Series::new("new_column".into(), &[100, 100, 3, 4, 5]) +//! &Column::new("new_column".into(), &[100, 100, 3, 4, 5]) //! ) //! ); //! ``` @@ -147,7 +147,7 @@ //! col("column_a") //! // apply a custom closure Series => Result //! .map(|_s| { -//! Ok(Some(Series::new("".into(), &[6.0f32, 6.0, 6.0, 6.0, 6.0]))) +//! Ok(Some(Column::new("".into(), &[6.0f32, 6.0, 6.0, 6.0, 6.0]))) //! }, //! // return type of the closure //! GetOutput::from_type(DataType::Float64)).alias("new_column") diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 1285f2e2296a..6d3a6e90ea4c 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -72,6 +72,9 @@ //! // from a chunked-array //! let ca = UInt32Chunked::new("foo".into(), &[Some(1), None, Some(3)]); //! let s = ca.into_series(); +//! +//! // into a Column +//! let s = s.into_column(); //! ``` //! //! ### DataFrame @@ -88,10 +91,10 @@ //! "values_nulls" => [Some(1), None, Some(3)] //! ]?; //! -//! // from a Vec -//! let s1 = Series::new("names".into(), &["a", "b", "c"]); -//! let s2 = Series::new("values".into(), &[Some(1), None, Some(3)]); -//! let df = DataFrame::new(vec![s1, s2])?; +//! // from a Vec +//! let c1 = Column::new("names".into(), &["a", "b", "c"]); +//! let c2 = Column::new("values".into(), &[Some(1), None, Some(3)]); +//! let df = DataFrame::new(vec![c1, c2])?; //! # Ok(()) //! # } //! ``` @@ -251,12 +254,12 @@ //! # fn example() -> PolarsResult<()> { //! //! // apply a closure over all values -//! let s = Column::new("foo".into(), &[Some(1), Some(2), None]); +//! let s = Series::new("foo".into(), &[Some(1), Some(2), None]); //! s.i32()?.apply_values(|value| value * 20); //! //! // count string lengths -//! let s = Column::new("foo".into(), &["foo", "bar", "foobar"]); -//! unary_elementwise_values(s.str()?, |str_val| str_val.len() as u64); +//! 
let s = Series::new("foo".into(), &["foo", "bar", "foobar"]); +//! unary_elementwise_values::(s.str()?, |str_val| str_val.len() as u64); //! //! # Ok(()) //! # } @@ -354,9 +357,14 @@ //! // ordering of the columns //! let descending = vec![true, false]; //! // columns to sort by -//! let by = &["b", "a"]; +//! let by = [PlSmallStr::from_static("b"), PlSmallStr::from_static("a")]; //! // do the sort operation -//! let sorted = df.sort(by, descending, true)?; +//! let sorted = df.sort( +//! by, +//! SortMultipleOptions::default() +//! .with_order_descending_multi(descending) +//! .with_maintain_order(true) +//! )?; //! //! // sorted: //! @@ -442,7 +450,14 @@ //! )?; //! //! // group_by "foo" | pivot "bar" column | aggregate "N" -//! let pivoted = pivot::pivot(&df, ["foo"], ["bar"], ["N"], false, Some(first()), None); +//! let pivoted = pivot::pivot( +//! &df, +//! [PlSmallStr::from_static("foo")], +//! Some([PlSmallStr::from_static("bar")]), +//! Some([PlSmallStr::from_static("N")]), +//! false, Some(first()), +//! None +//! ); //! //! // pivoted: //! // +-----+------+------+------+------+------+ @@ -475,8 +490,8 @@ //! ]?; //! //! let unpivoted = df.unpivot( -//! &[PlSmallStr::from_static("A"), PlSmallStr::from_static("B")], -//! &[PlSmallStr::from_static("C"), PlSmallStr::from_static("D")], +//! [PlSmallStr::from_static("A"), PlSmallStr::from_static("B")], +//! [PlSmallStr::from_static("C"), PlSmallStr::from_static("D")], //! ).unwrap(); //! // unpivoted: //! @@ -560,10 +575,8 @@ //! //! # fn example(df: &DataFrame) -> PolarsResult<()> { //! // read from path -//! let df = CsvReader::from_path("iris_csv")? -//! .infer_schema(None) -//! .has_header(true) -//! .finish()?; +//! let mut file = std::fs::File::open("iris_csv")?; +//! let df = CsvReader::new(file).finish()?; //! # Ok(()) //! # } //! 
``` diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index 82a093e6c3ab..c77bf58d5cac 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -30,7 +30,7 @@ //! //! // scan a csv file lazily //! let lf: LazyFrame = LazyCsvReader::new("some_path") -//! .has_header(true) +//! .with_has_header(true) //! .finish()?; //! //! // scan a parquet file lazily @@ -82,7 +82,7 @@ //! // sort this DataFrame by multiple columns //! //! let sorted = df.lazy() -//! .sort_by_exprs(vec![col("b"), col("a")]) +//! .sort_by_exprs(vec![col("b"), col("a")], SortMultipleOptions::default()) //! .collect()?; //! //! // sorted: @@ -110,7 +110,7 @@ //! # fn example() -> PolarsResult<()> { //! //! let df = LazyCsvReader::new("reddit.csv") -//! .has_header(true) +//! .with_has_header(true) //! .with_separator(b',') //! .finish()? //! .group_by([col("comment_karma")]) From 5b2c6aa53a2ae3e7dd72a99570fe10fc86407fc1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 12 Sep 2024 11:32:30 +0200 Subject: [PATCH 26/42] add docs and actually create scalar columns --- .../chunked_array/builder/binary_offset.rs | 51 ++++ .../src/chunked_array/builder/mod.rs | 2 + crates/polars-core/src/datatypes/any_value.rs | 12 +- crates/polars-core/src/frame/column.rs | 248 +++++++++--------- crates/polars-core/src/frame/mod.rs | 3 +- crates/polars-core/src/scalar/mod.rs | 7 +- crates/polars-core/src/series/any_value.rs | 39 +++ crates/polars-core/src/utils/mod.rs | 3 +- crates/polars-lazy/src/dsl/eval.rs | 2 +- 9 files changed, 230 insertions(+), 137 deletions(-) create mode 100644 crates/polars-core/src/chunked_array/builder/binary_offset.rs diff --git a/crates/polars-core/src/chunked_array/builder/binary_offset.rs b/crates/polars-core/src/chunked_array/builder/binary_offset.rs new file mode 100644 index 000000000000..c7d084294528 --- /dev/null +++ b/crates/polars-core/src/chunked_array/builder/binary_offset.rs @@ -0,0 +1,51 @@ +use super::*; + +pub struct 
BinaryOffsetChunkedBuilder { + pub(crate) chunk_builder: MutableBinaryArray, + pub(crate) field: FieldRef, +} + +impl Clone for BinaryOffsetChunkedBuilder { + fn clone(&self) -> Self { + Self { + chunk_builder: self.chunk_builder.clone(), + field: self.field.clone(), + } + } +} + +impl BinaryOffsetChunkedBuilder { + /// Create a new [`BinaryOffsetChunkedBuilder`] + /// + /// # Arguments + /// + /// * `capacity` - Number of string elements in the final array. + pub fn new(name: PlSmallStr, capacity: usize) -> Self { + Self { + chunk_builder: MutableBinaryArray::with_capacity(capacity), + field: Arc::new(Field::new(name, DataType::BinaryOffset)), + } + } + + /// Appends a value of type `T` into the builder + #[inline] + pub fn append_value(&mut self, v: &[u8]) { + self.chunk_builder.push(Some(v)); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.chunk_builder.push_null() + } + + #[inline] + pub fn append_option(&mut self, opt: Option<&[u8]>) { + self.chunk_builder.push(opt); + } + + pub fn finish(mut self) -> BinaryOffsetChunked { + let arr = self.chunk_builder.as_box(); + ChunkedArray::new_with_compute_len(self.field, vec![arr]) + } +} diff --git a/crates/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs index 539586c2193e..e818254d581b 100644 --- a/crates/polars-core/src/chunked_array/builder/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/mod.rs @@ -1,3 +1,4 @@ +mod binary_offset; mod boolean; #[cfg(feature = "dtype-array")] pub mod fixed_size_list; @@ -10,6 +11,7 @@ use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; +pub use binary_offset::*; pub use boolean::*; #[cfg(feature = "dtype-array")] pub(crate) use fixed_size_list::*; diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 5721ee2db2a9..81adc0d7f5d2 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ 
b/crates/polars-core/src/datatypes/any_value.rs @@ -912,7 +912,7 @@ impl<'a> AnyValue<'a> { Object(v) => ObjectOwned(OwnedObject(v.to_boxed())), #[cfg(feature = "dtype-struct")] Struct(idx, arr, fields) => { - let avs = struct_to_avs_static(idx, arr, fields); + let avs = struct_to_avs_static(idx, arr, fields)?; StructOwned(Box::new((avs, fields.to_vec()))) }, #[cfg(feature = "dtype-struct")] @@ -1224,7 +1224,11 @@ impl TotalEq for AnyValue<'_> { } #[cfg(feature = "dtype-struct")] -fn struct_to_avs_static(idx: usize, arr: &StructArray, fields: &[Field]) -> Vec> { +fn struct_to_avs_static( + idx: usize, + arr: &StructArray, + fields: &[Field], +) -> PolarsResult>> { let arrs = arr.values(); let mut avs = Vec::with_capacity(arrs.len()); // amortize loop counter @@ -1233,10 +1237,10 @@ fn struct_to_avs_static(idx: usize, arr: &StructArray, fields: &[Field]) -> Vec< let arr = &**arrs.get_unchecked_release(i); let field = fields.get_unchecked_release(i); let av = arr_to_any_value(arr, idx, &field.dtype); - avs.push_unchecked(av.into_static().unwrap()); + avs.push_unchecked(av.into_static()?); } } - avs + Ok(avs) } #[cfg(feature = "dtype-categorical")] diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs index 2a07af755a66..8f5b87d9c9da 100644 --- a/crates/polars-core/src/frame/column.rs +++ b/crates/polars-core/src/frame/column.rs @@ -9,6 +9,7 @@ use polars_utils::pl_str::PlSmallStr; use crate::chunked_array::metadata::MetadataFlags; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; +use crate::utils::Container; /// A column within a [`DataFrame`]. /// @@ -28,7 +29,9 @@ pub enum Column { Scalar(ScalarColumn), } -/// A column +/// A [`Column`] that consists of a repeated [`Scalar`] +/// +/// This is lazily materialized into a [`Series`]. 
#[derive(Debug, Clone)] pub struct ScalarColumn { name: PlSmallStr, @@ -61,8 +64,7 @@ impl Column { #[inline] pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { - // @scalar-opt - Self::Series(Series::new_empty(name, dtype)) + Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), 0) } #[inline] @@ -127,6 +129,42 @@ impl Column { } } + #[inline] + pub fn name(&self) -> &PlSmallStr { + match self { + Column::Series(s) => s.name(), + Column::Scalar(s) => &s.name, + } + } + + #[inline] + pub fn len(&self) -> usize { + match self { + Column::Series(s) => s.len(), + Column::Scalar(s) => s.length, + } + } + + #[inline] + pub fn with_name(mut self, name: PlSmallStr) -> Column { + self.rename(name); + self + } + + #[inline] + pub fn rename(&mut self, name: PlSmallStr) { + match self { + Column::Series(s) => _ = s.rename(name), + Column::Scalar(s) => { + if let Some(series) = s.materialized.get_mut() { + series.rename(name.clone()); + } + + s.name = name; + }, + } + } + // # Downcasting #[inline] pub fn as_series(&self) -> Option<&Series> { @@ -143,79 +181,109 @@ impl Column { } } + // # To Chunked Arrays + pub fn bool(&self) -> PolarsResult<&BooleanChunked> { + // @scalar-opt + self.as_materialized_series().bool() + } pub fn i8(&self) -> PolarsResult<&Int8Chunked> { // @scalar-opt self.as_materialized_series().i8() } - pub fn i16(&self) -> PolarsResult<&Int16Chunked> { // @scalar-opt self.as_materialized_series().i16() } - pub fn i32(&self) -> PolarsResult<&Int32Chunked> { // @scalar-opt self.as_materialized_series().i32() } - pub fn i64(&self) -> PolarsResult<&Int64Chunked> { // @scalar-opt self.as_materialized_series().i64() } - pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { // @scalar-opt self.as_materialized_series().u8() } - pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { // @scalar-opt self.as_materialized_series().u16() } - pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { // @scalar-opt self.as_materialized_series().u32() } - pub fn 
u64(&self) -> PolarsResult<&UInt64Chunked> { // @scalar-opt self.as_materialized_series().u64() } - pub fn f32(&self) -> PolarsResult<&Float32Chunked> { // @scalar-opt self.as_materialized_series().f32() } - pub fn f64(&self) -> PolarsResult<&Float64Chunked> { // @scalar-opt self.as_materialized_series().f64() } - pub fn str(&self) -> PolarsResult<&StringChunked> { // @scalar-opt self.as_materialized_series().str() } - + pub fn list(&self) -> PolarsResult<&ListChunked> { + // @scalar-opt + self.as_materialized_series().list() + } + pub fn binary(&self) -> PolarsResult<&BinaryChunked> { + // @scalar-opt + self.as_materialized_series().binary() + } + pub fn idx(&self) -> PolarsResult<&IdxCa> { + // @scalar-opt + self.as_materialized_series().idx() + } #[cfg(feature = "dtype-datetime")] pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { // @scalar-opt self.as_materialized_series().datetime() } + #[cfg(feature = "dtype-struct")] + pub fn struct_(&self) -> PolarsResult<&StructChunked> { + // @scalar-opt + self.as_materialized_series().struct_() + } + #[cfg(feature = "dtype-decimal")] + pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { + // @scalar-opt + self.as_materialized_series().decimal() + } + #[cfg(feature = "dtype-array")] + pub fn array(&self) -> PolarsResult<&ArrayChunked> { + // @scalar-opt + self.as_materialized_series().array() + } + #[cfg(feature = "dtype-categorical")] + pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { + self.as_materialized_series().categorical() + } - #[inline] - pub fn rename(&mut self, name: PlSmallStr) { - match self { - Column::Series(s) => _ = s.rename(name), - Column::Scalar(s) => { - if let Some(series) = s.materialized.get_mut() { - series.rename(name.clone()); - } - - s.name = name; - }, - } + // # Casting + pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .strict_cast(dtype) + .map(Column::from) + } + pub fn cast(&self, dtype: 
&DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().cast(dtype).map(Column::from) + } + /// # Safety + /// + /// This can lead to invalid memory access in downstream code. + pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) } pub fn clear(&self) -> Self { @@ -239,22 +307,6 @@ impl Column { Self::Series(self.as_materialized_series().new_from_index(index, length)) } - #[inline] - pub fn len(&self) -> usize { - match self { - Column::Series(s) => s.len(), - Column::Scalar(s) => s.length, - } - } - - #[inline] - pub fn name(&self) -> &PlSmallStr { - match self { - Column::Series(s) => s.name(), - Column::Scalar(s) => &s.name, - } - } - pub fn has_nulls(&self) -> bool { // @scalar-opt self.as_materialized_series().has_nulls() @@ -494,18 +546,6 @@ impl Column { } } - #[cfg(feature = "dtype-categorical")] - pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { - self.as_materialized_series().categorical() - } - - pub fn with_name(self, name: PlSmallStr) -> Column { - match self { - Column::Series(s) => s.with_name(name).into(), - Column::Scalar(s) => s.with_name(name).into(), - } - } - pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { // @scalar-opt self.into_materialized_series() @@ -518,40 +558,14 @@ impl Column { self.as_materialized_series().arg_sort(options) } - pub fn cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().cast(dtype).map(Column::from) - } - - pub fn idx(&self) -> PolarsResult<&IdxCa> { - // @scalar-opt - self.as_materialized_series().idx() - } - - pub fn binary(&self) -> PolarsResult<&BinaryChunked> { - // @scalar-opt - self.as_materialized_series().binary() - } - pub fn bit_repr(&self) -> Option { // @scalar-opt self.as_materialized_series().bit_repr() } - pub fn bool(&self) -> PolarsResult<&BooleanChunked> { - // @scalar-opt - 
self.as_materialized_series().bool() - } - - #[cfg(feature = "dtype-struct")] - pub fn struct_(&self) -> PolarsResult<&StructChunked> { - // @scalar-opt - self.as_materialized_series().struct_() - } - - pub fn into_frame(&self) -> DataFrame { - // @scalar-opt - self.as_materialized_series().clone().into_frame() + pub fn into_frame(self) -> DataFrame { + // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names + unsafe { DataFrame::new_no_checks(vec![self]) } } pub fn unique_stable(&self) -> PolarsResult { @@ -599,21 +613,6 @@ impl Column { self.as_materialized_series().shift(periods).into() } - pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .strict_cast(dtype) - .map(Column::from) - } - - /// # Safety - /// - /// This can lead to invalid memory access in downstream code. - pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) - } - #[cfg(feature = "zip_with")] pub fn zip_with_same_type( &self, @@ -641,12 +640,6 @@ impl Column { self.as_materialized_series().get(index) } - #[cfg(feature = "dtype-decimal")] - pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { - // @scalar-opt - self.as_materialized_series().decimal() - } - pub fn unique(&self) -> PolarsResult { // @scalar-opt self.as_materialized_series().unique().map(Column::from) @@ -723,17 +716,6 @@ impl Column { .map(Self::from) } - #[cfg(feature = "dtype-array")] - pub fn array(&self) -> PolarsResult<&ArrayChunked> { - // @scalar-opt - self.as_materialized_series().array() - } - - pub fn list(&self) -> PolarsResult<&ListChunked> { - // @scalar-opt - self.as_materialized_series().list() - } - pub fn is_null(&self) -> BooleanChunked { // @scalar-opt self.as_materialized_series().is_null() @@ -1108,14 +1090,23 @@ impl ScalarColumn { Self { name, value, - materialized: OnceLock::new(), length, + + 
materialized: OnceLock::new(), } } fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { - // @TODO: There is probably a better way to do this. - value.into_series(name).new_from_index(0, length) + let series = if length == 0 { + Series::new_empty(name, value.dtype()) + } else { + // @TODO: There is probably a better way to do this. + value.into_series(name).new_from_index(0, length) + }; + + debug_assert_eq!(series.len(), length); + + series } pub fn to_series(&self) -> Series { @@ -1135,17 +1126,26 @@ impl ScalarColumn { .into_inner() .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) } - - fn with_name(self, name: PlSmallStr) -> Self { - // @TODO: Keep materialized somehow? - Self::new(name, self.value, self.length) - } } impl IntoColumn for T { #[inline] fn into_column(self) -> Column { - Column::from(self.into_series()) + let series = self.into_series(); + + if series.len() == 1 { + // SAFETY: We just did the bounds check + let value = unsafe { series.get_unchecked(0) }; + + if let Ok(value) = value.into_static() { + let value = Scalar::new(series.dtype().clone(), value); + let mut col = ScalarColumn::new(series.name().clone(), value, 1); + col.materialized = OnceLock::from(series); + return Column::Scalar(col); + } + } + + Column::Series(series) } } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 2bd2ff576da5..567b2c046a34 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -722,7 +722,8 @@ impl DataFrame { pub fn n_chunks(&self) -> usize { // @scalar-correctness? 
match self.first_series_column() { - None => 0, + None if self.columns.is_empty() => 0, + None => 1, Some(s) => s.n_chunks(), } } diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index 22eef608e8c7..2e762eeb7a32 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -5,7 +5,7 @@ use polars_utils::pl_str::PlSmallStr; use serde::{Deserialize, Serialize}; use crate::datatypes::{AnyValue, DataType}; -use crate::prelude::{Column, IntoColumn, Series}; +use crate::prelude::{Column, Series}; #[derive(Clone, Debug, PartialEq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -47,10 +47,7 @@ impl Scalar { /// Turn a scalar into a column with `length=1`. pub fn into_column(self, name: PlSmallStr) -> Column { - // @scalar-opt - Series::from_any_values_and_dtype(name, &[self.as_any_value()], &self.dtype, true) - .unwrap() - .into_column() + Column::new_scalar(name, self, 1) } #[inline(always)] diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index aaa4bc753443..3e9310427034 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -2,6 +2,7 @@ use std::fmt::Write; use arrow::bitmap::MutableBitmap; +use crate::chunked_array::builder::BinaryOffsetChunkedBuilder; #[cfg(feature = "dtype-categorical")] use crate::chunked_array::cast::CastOptions; #[cfg(feature = "object")] @@ -163,6 +164,7 @@ impl Series { #[cfg(feature = "object")] DataType::Object(_, registry) => any_values_to_object(values, registry)?, DataType::Null => Series::new_null(PlSmallStr::EMPTY, values.len()), + DataType::BinaryOffset => any_values_to_binary_offset(values, strict)?.into_series(), dt => { polars_bail!( InvalidOperation: @@ -344,6 +346,43 @@ fn any_values_to_binary(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { + fn any_values_to_binary_offset_strict( + values: &[AnyValue], + ) -> 
PolarsResult { + let mut builder = BinaryOffsetChunkedBuilder::new(PlSmallStr::EMPTY, values.len()); + for av in values { + match av { + AnyValue::Binary(s) => builder.append_value(*s), + AnyValue::BinaryOwned(s) => builder.append_value(&**s), + AnyValue::Null => builder.append_null(), + av => return Err(invalid_value_error(&DataType::Binary, av)), + } + } + Ok(builder.finish()) + } + fn any_values_to_binary_offset_nonstrict(values: &[AnyValue]) -> BinaryOffsetChunked { + values + .iter() + .map(|av| match av { + AnyValue::Binary(b) => Some(*b), + AnyValue::BinaryOwned(b) => Some(&**b), + AnyValue::String(s) => Some(s.as_bytes()), + AnyValue::StringOwned(s) => Some(s.as_str().as_bytes()), + _ => None, + }) + .collect_trusted() + } + if strict { + any_values_to_binary_offset_strict(values) + } else { + Ok(any_values_to_binary_offset_nonstrict(values)) + } +} + #[cfg(feature = "dtype-date")] fn any_values_to_date(values: &[AnyValue], strict: bool) -> PolarsResult { let mut builder = PrimitiveChunkedBuilder::::new(PlSmallStr::EMPTY, values.len()); diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index fc5c77a65ad6..4773a9dd0d20 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -142,8 +142,7 @@ impl Container for DataFrame { fn chunk_lengths(&self) -> impl Iterator { // @scalar-correctness? 
- // This should return a option - self.first_series_column().unwrap().chunk_lengths() + self.columns[0].as_materialized_series().chunk_lengths() } } diff --git a/crates/polars-lazy/src/dsl/eval.rs b/crates/polars-lazy/src/dsl/eval.rs index 62d4a446b7f7..dcb4853f0671 100644 --- a/crates/polars-lazy/src/dsl/eval.rs +++ b/crates/polars-lazy/src/dsl/eval.rs @@ -81,7 +81,7 @@ pub trait ExprEvalExtension: IntoExpr + Sized { .map(|len| { let s = c.slice(0, len); if (len - s.null_count()) >= min_periods { - let df = c.into_frame(); + let df = c.clone().into_frame(); let out = phys_expr.evaluate(&df, &state)?.into_column(); finish(out) } else { From 487d417c50c5f1c46cf6c330d7ae66db3fe81cd1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 12 Sep 2024 12:01:39 +0200 Subject: [PATCH 27/42] fix clippy --- crates/polars-core/src/series/any_value.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 3e9310427034..65336b100fa1 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -356,8 +356,8 @@ fn any_values_to_binary_offset( let mut builder = BinaryOffsetChunkedBuilder::new(PlSmallStr::EMPTY, values.len()); for av in values { match av { - AnyValue::Binary(s) => builder.append_value(*s), - AnyValue::BinaryOwned(s) => builder.append_value(&**s), + AnyValue::Binary(s) => builder.append_value(s), + AnyValue::BinaryOwned(s) => builder.append_value(s.as_slice()), AnyValue::Null => builder.append_null(), av => return Err(invalid_value_error(&DataType::Binary, av)), } From 2d4a0dade0fac28817505e64017c335e7995df2f Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 12 Sep 2024 21:26:14 +0200 Subject: [PATCH 28/42] start on a lot of the column optimizations --- .../chunked_array/builder/binary_offset.rs | 51 - .../src/chunked_array/builder/mod.rs | 2 - crates/polars-core/src/datatypes/any_value.rs | 10 + 
crates/polars-core/src/frame/column.rs | 1187 ----------------- crates/polars-core/src/lib.rs | 5 + crates/polars-core/src/series/any_value.rs | 39 - crates/polars-core/src/series/mod.rs | 13 +- .../src/executors/group_by_rolling.rs | 10 +- .../polars-ops/src/frame/join/asof/groups.rs | 4 +- .../src/dsl/function_expr/shift_and_fill.rs | 1 + crates/polars-utils/src/index.rs | 17 +- 11 files changed, 30 insertions(+), 1309 deletions(-) delete mode 100644 crates/polars-core/src/chunked_array/builder/binary_offset.rs delete mode 100644 crates/polars-core/src/frame/column.rs diff --git a/crates/polars-core/src/chunked_array/builder/binary_offset.rs b/crates/polars-core/src/chunked_array/builder/binary_offset.rs deleted file mode 100644 index c7d084294528..000000000000 --- a/crates/polars-core/src/chunked_array/builder/binary_offset.rs +++ /dev/null @@ -1,51 +0,0 @@ -use super::*; - -pub struct BinaryOffsetChunkedBuilder { - pub(crate) chunk_builder: MutableBinaryArray, - pub(crate) field: FieldRef, -} - -impl Clone for BinaryOffsetChunkedBuilder { - fn clone(&self) -> Self { - Self { - chunk_builder: self.chunk_builder.clone(), - field: self.field.clone(), - } - } -} - -impl BinaryOffsetChunkedBuilder { - /// Create a new [`BinaryOffsetChunkedBuilder`] - /// - /// # Arguments - /// - /// * `capacity` - Number of string elements in the final array. 
- pub fn new(name: PlSmallStr, capacity: usize) -> Self { - Self { - chunk_builder: MutableBinaryArray::with_capacity(capacity), - field: Arc::new(Field::new(name, DataType::BinaryOffset)), - } - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: &[u8]) { - self.chunk_builder.push(Some(v)); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.chunk_builder.push_null() - } - - #[inline] - pub fn append_option(&mut self, opt: Option<&[u8]>) { - self.chunk_builder.push(opt); - } - - pub fn finish(mut self) -> BinaryOffsetChunked { - let arr = self.chunk_builder.as_box(); - ChunkedArray::new_with_compute_len(self.field, vec![arr]) - } -} diff --git a/crates/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs index e818254d581b..539586c2193e 100644 --- a/crates/polars-core/src/chunked_array/builder/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/mod.rs @@ -1,4 +1,3 @@ -mod binary_offset; mod boolean; #[cfg(feature = "dtype-array")] pub mod fixed_size_list; @@ -11,7 +10,6 @@ use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; -pub use binary_offset::*; pub use boolean::*; #[cfg(feature = "dtype-array")] pub(crate) use fixed_size_list::*; diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 81adc0d7f5d2..43ca425796df 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -693,6 +693,16 @@ impl<'a> AnyValue<'a> { None => AnyValue::Null, } } + + pub fn idx(&self) -> IdxSize { + match self { + #[cfg(not(feature = "bigidx"))] + Self::UInt32(v) => *v, + #[cfg(feature = "bigidx")] + Self::UInt64(v) => *v, + _ => panic!("expected index type found {self:?}"), + } + } } impl From> for DataType { diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs 
deleted file mode 100644 index 8f5b87d9c9da..000000000000 --- a/crates/polars-core/src/frame/column.rs +++ /dev/null @@ -1,1187 +0,0 @@ -use std::borrow::Cow; -use std::ops::{Add, Div, Mul, Rem, Sub}; -use std::sync::OnceLock; - -use num_traits::{Num, NumCast}; -use polars_error::PolarsResult; -use polars_utils::pl_str::PlSmallStr; - -use crate::chunked_array::metadata::MetadataFlags; -use crate::prelude::*; -use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; -use crate::utils::Container; - -/// A column within a [`DataFrame`]. -/// -/// This is lazily initialized to a [`Series`] with methods like -/// [`as_materialized_series`][Column::as_materialized_series] and -/// [`take_materialized_series`][Column::take_materialized_series]. -/// -/// Currently, there are two ways to represent a [`Column`]. -/// 1. A [`Series`] of values -/// 2. A [`ScalarColumn`] that repeats a single [`Scalar`] -#[derive(Debug, Clone)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -#[cfg_attr(feature = "serde", serde(from = "Series"))] -#[cfg_attr(feature = "serde", serde(into = "_SerdeSeries"))] -pub enum Column { - Series(Series), - Scalar(ScalarColumn), -} - -/// A [`Column`] that consists of a repeated [`Scalar`] -/// -/// This is lazily materialized into a [`Series`]. 
-#[derive(Debug, Clone)] -pub struct ScalarColumn { - name: PlSmallStr, - value: Scalar, - length: usize, - - // invariants: - // materialized.name() == name - // materialized.len() == length - // materialized.dtype() == value.dtype - // materialized[i] == value, for all 0 <= i < length - /// A lazily materialized [`Series`] variant of this [`ScalarColumn`] - materialized: OnceLock, -} - -/// Convert `Self` into a [`Column`] -pub trait IntoColumn: Sized { - fn into_column(self) -> Column; -} - -impl Column { - #[inline] - pub fn new(name: PlSmallStr, values: T) -> Self - where - Phantom: ?Sized, - Series: NamedFrom, - { - Self::Series(NamedFrom::new(name, values)) - } - - #[inline] - pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { - Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), 0) - } - - #[inline] - pub fn new_scalar(name: PlSmallStr, value: Scalar, length: usize) -> Self { - Self::Scalar(ScalarColumn::new(name, value, length)) - } - - // # Materialize - /// Get a reference to a [`Series`] for this [`Column`] - /// - /// This may need to materialize the [`Series`] on the first invocation for a specific column. - #[inline] - pub fn as_materialized_series(&self) -> &Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => s.as_materialized_series(), - } - } - /// Turn [`Column`] into a [`Column::Series`]. - /// - /// This may need to materialize the [`Series`] on the first invocation for a specific column. - #[inline] - pub fn into_materialized_series(&mut self) -> &mut Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => { - *self = Column::Series(s.to_series()); - let Column::Series(s) = self else { - unreachable!(); - }; - s - }, - } - } - /// Take [`Series`] from a [`Column`] - /// - /// This may need to materialize the [`Series`] on the first invocation for a specific column. 
- #[inline] - pub fn take_materialized_series(self) -> Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => s.take_materialized_series(), - } - } - - #[inline] - pub fn dtype(&self) -> &DataType { - match self { - Column::Series(s) => s.dtype(), - Column::Scalar(s) => s.value.dtype(), - } - } - - #[inline] - pub fn field(&self) -> Cow { - match self { - Column::Series(s) => s.field(), - Column::Scalar(s) => match s.materialized.get() { - None => Cow::Owned(Field::new(s.name.clone(), s.value.dtype().clone())), - Some(s) => s.field(), - }, - } - } - - #[inline] - pub fn name(&self) -> &PlSmallStr { - match self { - Column::Series(s) => s.name(), - Column::Scalar(s) => &s.name, - } - } - - #[inline] - pub fn len(&self) -> usize { - match self { - Column::Series(s) => s.len(), - Column::Scalar(s) => s.length, - } - } - - #[inline] - pub fn with_name(mut self, name: PlSmallStr) -> Column { - self.rename(name); - self - } - - #[inline] - pub fn rename(&mut self, name: PlSmallStr) { - match self { - Column::Series(s) => _ = s.rename(name), - Column::Scalar(s) => { - if let Some(series) = s.materialized.get_mut() { - series.rename(name.clone()); - } - - s.name = name; - }, - } - } - - // # Downcasting - #[inline] - pub fn as_series(&self) -> Option<&Series> { - match self { - Column::Series(s) => Some(s), - Column::Scalar(_) => None, - } - } - #[inline] - pub fn as_scalar_column(&self) -> Option<&ScalarColumn> { - match self { - Column::Series(_) => None, - Column::Scalar(s) => Some(s), - } - } - - // # To Chunked Arrays - pub fn bool(&self) -> PolarsResult<&BooleanChunked> { - // @scalar-opt - self.as_materialized_series().bool() - } - pub fn i8(&self) -> PolarsResult<&Int8Chunked> { - // @scalar-opt - self.as_materialized_series().i8() - } - pub fn i16(&self) -> PolarsResult<&Int16Chunked> { - // @scalar-opt - self.as_materialized_series().i16() - } - pub fn i32(&self) -> PolarsResult<&Int32Chunked> { - // @scalar-opt - 
self.as_materialized_series().i32() - } - pub fn i64(&self) -> PolarsResult<&Int64Chunked> { - // @scalar-opt - self.as_materialized_series().i64() - } - pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { - // @scalar-opt - self.as_materialized_series().u8() - } - pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { - // @scalar-opt - self.as_materialized_series().u16() - } - pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { - // @scalar-opt - self.as_materialized_series().u32() - } - pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { - // @scalar-opt - self.as_materialized_series().u64() - } - pub fn f32(&self) -> PolarsResult<&Float32Chunked> { - // @scalar-opt - self.as_materialized_series().f32() - } - pub fn f64(&self) -> PolarsResult<&Float64Chunked> { - // @scalar-opt - self.as_materialized_series().f64() - } - pub fn str(&self) -> PolarsResult<&StringChunked> { - // @scalar-opt - self.as_materialized_series().str() - } - pub fn list(&self) -> PolarsResult<&ListChunked> { - // @scalar-opt - self.as_materialized_series().list() - } - pub fn binary(&self) -> PolarsResult<&BinaryChunked> { - // @scalar-opt - self.as_materialized_series().binary() - } - pub fn idx(&self) -> PolarsResult<&IdxCa> { - // @scalar-opt - self.as_materialized_series().idx() - } - #[cfg(feature = "dtype-datetime")] - pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { - // @scalar-opt - self.as_materialized_series().datetime() - } - #[cfg(feature = "dtype-struct")] - pub fn struct_(&self) -> PolarsResult<&StructChunked> { - // @scalar-opt - self.as_materialized_series().struct_() - } - #[cfg(feature = "dtype-decimal")] - pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { - // @scalar-opt - self.as_materialized_series().decimal() - } - #[cfg(feature = "dtype-array")] - pub fn array(&self) -> PolarsResult<&ArrayChunked> { - // @scalar-opt - self.as_materialized_series().array() - } - #[cfg(feature = "dtype-categorical")] - pub fn categorical(&self) -> 
PolarsResult<&CategoricalChunked> { - self.as_materialized_series().categorical() - } - - // # Casting - pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .strict_cast(dtype) - .map(Column::from) - } - pub fn cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().cast(dtype).map(Column::from) - } - /// # Safety - /// - /// This can lead to invalid memory access in downstream code. - pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) - } - - pub fn clear(&self) -> Self { - match self { - Column::Series(s) => s.clear().into(), - Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), - } - } - - #[inline] - pub fn shrink_to_fit(&mut self) { - match self { - Column::Series(s) => s.shrink_to_fit(), - Column::Scalar(_) => {}, - } - } - - #[inline] - pub fn new_from_index(&self, index: usize, length: usize) -> Self { - // @scalar-opt - Self::Series(self.as_materialized_series().new_from_index(index, length)) - } - - pub fn has_nulls(&self) -> bool { - // @scalar-opt - self.as_materialized_series().has_nulls() - } - - pub fn is_not_null(&self) -> ChunkedArray { - // @scalar-opt - self.as_materialized_series().is_not_null() - } - - pub fn to_physical_repr(&self) -> Column { - // @scalar-opt - self.as_materialized_series() - .to_physical_repr() - .into_owned() - .into() - } - - pub fn head(&self, length: Option) -> Column { - // @scalar-opt - self.as_materialized_series().head(length).into() - } - - pub fn tail(&self, length: Option) -> Column { - // @scalar-opt - self.as_materialized_series().tail(length).into() - } - - pub fn slice(&self, offset: i64, length: usize) -> Column { - // @scalar-opt - self.as_materialized_series().slice(offset, length).into() - } - - pub fn split_at(&self, offset: i64) -> (Column, Column) { - // 
@scalar-opt - let (l, r) = self.as_materialized_series().split_at(offset); - (l.into(), r.into()) - } - - pub fn null_count(&self) -> usize { - // @scalar-opt - self.as_materialized_series().null_count() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_min(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_max(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_mean(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_sum(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_first(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_last(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. 
- pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_quantile( - &self, - groups: &GroupsProxy, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> Self { - // @scalar-opt - unsafe { - self.as_materialized_series() - .agg_quantile(groups, quantile, interpol) - } - .into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_median(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub(crate) unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. 
- #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_list(groups) }.into() - } - - pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Column { - // @scalar-opt - Series::full_null(name, size, dtype).into() - } - - pub fn is_empty(&self) -> bool { - // @scalar-opt - self.as_materialized_series().is_empty() - } - - pub fn reverse(&self) -> Column { - // @scalar-opt - self.as_materialized_series().reverse().into() - } - - pub fn equals(&self, right: &Column) -> bool { - // @scalar-opt - self.as_materialized_series() - .equals(right.as_materialized_series()) - } - - pub fn equals_missing(&self, right: &Column) -> bool { - // @scalar-opt - self.as_materialized_series() - .equals_missing(right.as_materialized_series()) - } - - pub fn set_sorted_flag(&mut self, sorted: IsSorted) { - // @scalar-opt - match self { - Column::Series(s) => s.set_sorted_flag(sorted), - Column::Scalar(_) => {}, - } - } - - pub fn get_flags(&self) -> MetadataFlags { - match self { - Column::Series(s) => s.get_flags(), - // @scalar-opt - Column::Scalar(_) => MetadataFlags::empty(), - } - } - - pub fn get_data_ptr(&self) -> usize { - // @scalar-opt - self.as_materialized_series().get_data_ptr() - } - - pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { - // @scalar-opt? - self.as_materialized_series().vec_hash(build_hasher, buf) - } - - pub fn vec_hash_combine( - &self, - build_hasher: PlRandomState, - hashes: &mut [u64], - ) -> PolarsResult<()> { - // @scalar-opt? - self.as_materialized_series() - .vec_hash_combine(build_hasher, hashes) - } - - /// # Safety - /// - /// Indexes need to be in bounds. 
- pub(crate) unsafe fn equal_element( - &self, - idx_self: usize, - idx_other: usize, - other: &Column, - ) -> bool { - // @scalar-opt - unsafe { - self.as_materialized_series().equal_element( - idx_self, - idx_other, - other.as_materialized_series(), - ) - } - } - - pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { - // @scalar-opt - self.into_materialized_series() - .append(other.as_materialized_series())?; - Ok(self) - } - - pub fn arg_sort(&self, options: SortOptions) -> IdxCa { - // @scalar-opt - self.as_materialized_series().arg_sort(options) - } - - pub fn bit_repr(&self) -> Option { - // @scalar-opt - self.as_materialized_series().bit_repr() - } - - pub fn into_frame(self) -> DataFrame { - // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names - unsafe { DataFrame::new_no_checks(vec![self]) } - } - - pub fn unique_stable(&self) -> PolarsResult { - // @scalar-opt? - self.as_materialized_series() - .unique_stable() - .map(Column::from) - } - - pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { - // @scalar-opt - self.into_materialized_series() - .extend(other.as_materialized_series())?; - Ok(self) - } - - pub fn rechunk(&self) -> Column { - match self { - Column::Series(s) => s.rechunk().into(), - Column::Scalar(_) => self.clone(), - } - } - - pub fn explode(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().explode().map(Column::from) - } - - pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .fill_null(strategy) - .map(Column::from) - } - - pub fn divide(&self, rhs: &Column) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .divide(rhs.as_materialized_series()) - .map(Column::from) - } - - pub fn shift(&self, periods: i64) -> Column { - // @scalar-opt - self.as_materialized_series().shift(periods).into() - } - - #[cfg(feature = "zip_with")] - pub fn zip_with_same_type( - &self, 
- mask: &ChunkedArray, - other: &Column, - ) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .zip_with_same_type(mask, other.as_materialized_series()) - .map(Column::from) - } - - pub fn drop_nulls(&self) -> Column { - // @scalar-opt - self.as_materialized_series().drop_nulls().into() - } - - pub fn is_sorted_flag(&self) -> IsSorted { - // @scalar-opt - self.as_materialized_series().is_sorted_flag() - } - - pub fn get(&self, index: usize) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().get(index) - } - - pub fn unique(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().unique().map(Column::from) - } - - pub fn reshape_list(&self, dimensions: &[i64]) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .reshape_list(dimensions) - .map(Self::from) - } - - #[cfg(feature = "dtype-array")] - pub fn reshape_array(&self, dimensions: &[i64]) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .reshape_array(dimensions) - .map(Self::from) - } - - pub fn sort(&self, sort_options: SortOptions) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .sort(sort_options) - .map(Self::from) - } - - pub fn filter(&self, filter: &ChunkedArray) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().filter(filter).map(Self::from) - } - - #[cfg(feature = "random")] - pub fn shuffle(&self, seed: Option) -> Self { - // @scalar-opt - self.as_materialized_series().shuffle(seed).into() - } - - #[cfg(feature = "random")] - pub fn sample_frac( - &self, - frac: f64, - with_replacement: bool, - shuffle: bool, - seed: Option, - ) -> PolarsResult { - self.as_materialized_series() - .sample_frac(frac, with_replacement, shuffle, seed) - .map(Self::from) - } - - #[cfg(feature = "random")] - pub fn sample_n( - &self, - n: usize, - with_replacement: bool, - shuffle: bool, - seed: Option, - ) -> PolarsResult { - self.as_materialized_series() - .sample_n(n, with_replacement, shuffle, 
seed) - .map(Self::from) - } - - pub fn gather_every(&self, n: usize, offset: usize) -> Column { - // @scalar-opt - self.as_materialized_series().gather_every(n, offset).into() - } - - pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .extend_constant(value, n) - .map(Self::from) - } - - pub fn is_null(&self) -> BooleanChunked { - // @scalar-opt - self.as_materialized_series().is_null() - } - - #[cfg(feature = "zip_with")] - pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .zip_with(mask, other.as_materialized_series()) - .map(Self::from) - } - - pub fn is_finite(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_finite() - } - - pub fn is_infinite(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_infinite() - } - - pub fn is_nan(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_nan() - } - - pub fn is_not_nan(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_not_nan() - } - - #[cfg(feature = "dtype-date")] - pub fn date(&self) -> PolarsResult<&DateChunked> { - // @scalar-opt - self.as_materialized_series().date() - } - - #[cfg(feature = "dtype-duration")] - pub fn duration(&self) -> PolarsResult<&DurationChunked> { - // @scalar-opt - self.as_materialized_series().duration() - } - - pub fn wrapping_trunc_div_scalar(&self, rhs: T) -> Self - where - T: Num + NumCast, - { - // @scalar-opt - self.as_materialized_series() - .wrapping_trunc_div_scalar(rhs) - .into() - } - - pub fn product(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().product() - } - - pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { - // @scalar-opt - self.as_materialized_series().binary_offset() - } - - pub fn phys_iter(&self) -> SeriesPhysIter<'_> { - // @scalar-opt - 
self.as_materialized_series().phys_iter() - } - - /// # Safety - /// - /// Does not perform bounds check on `index` - pub unsafe fn get_unchecked(&self, index: usize) -> AnyValue { - // @scalar-opt - self.as_materialized_series().get_unchecked(index) - } - - #[cfg(feature = "object")] - pub fn get_object( - &self, - index: usize, - ) -> Option<&dyn crate::chunked_array::object::PolarsObjectSafe> { - self.as_materialized_series().get_object(index) - } - - pub fn bitand(&self, rhs: &Self) -> PolarsResult { - self.as_materialized_series() - .bitand(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl ChunkCompare<&Column> for Column { - type Item = PolarsResult; - - /// Create a boolean mask by checking for equality. - #[inline] - fn equal(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .equal(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for equality. - #[inline] - fn equal_missing(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .equal_missing(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for inequality. - #[inline] - fn not_equal(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .not_equal(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for inequality. - #[inline] - fn not_equal_missing(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .not_equal_missing(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self > rhs. - #[inline] - fn gt(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .gt(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self >= rhs. - #[inline] - fn gt_eq(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .gt_eq(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self < rhs. 
- #[inline] - fn lt(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .lt(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self <= rhs. - #[inline] - fn lt_eq(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .lt_eq(rhs.as_materialized_series()) - } -} - -impl Default for Column { - fn default() -> Self { - // @scalar-opt - Column::Series(Series::default()) - } -} - -impl PartialEq for Column { - fn eq(&self, other: &Self) -> bool { - // @scalar-opt - self.as_materialized_series() - .eq(other.as_materialized_series()) - } -} - -impl From for Column { - #[inline] - fn from(value: Series) -> Self { - Self::Series(value) - } -} - -impl From for Column { - #[inline] - fn from(value: ScalarColumn) -> Self { - Self::Scalar(value) - } -} - -impl Add for Column { - type Output = PolarsResult; - - fn add(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .add(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Add for &Column { - type Output = PolarsResult; - - fn add(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .add(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for Column { - type Output = PolarsResult; - - fn sub(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .sub(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for &Column { - type Output = PolarsResult; - - fn sub(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .sub(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Mul for Column { - type Output = PolarsResult; - - fn mul(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .mul(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Mul for &Column { - type Output = PolarsResult; - - fn mul(self, rhs: Self) -> Self::Output { - 
// @scalar-opt - self.as_materialized_series() - .mul(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn sub(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().sub(rhs).into() - } -} - -impl Sub for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn sub(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().sub(rhs).into() - } -} - -impl Add for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn add(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().add(rhs).into() - } -} - -impl Add for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn add(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().add(rhs).into() - } -} - -impl Div for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn div(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().div(rhs).into() - } -} - -impl Div for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn div(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().div(rhs).into() - } -} - -impl Mul for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn mul(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().mul(rhs).into() - } -} - -impl Mul for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn mul(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().mul(rhs).into() - } -} - -impl Rem for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn rem(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().rem(rhs).into() - } -} - -impl Rem for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn rem(self, rhs: T) -> Self::Output { - // @scalar-opt - 
self.as_materialized_series().rem(rhs).into() - } -} - -impl ScalarColumn { - #[inline] - pub fn new(name: PlSmallStr, value: Scalar, length: usize) -> Self { - Self { - name, - value, - length, - - materialized: OnceLock::new(), - } - } - - fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { - let series = if length == 0 { - Series::new_empty(name, value.dtype()) - } else { - // @TODO: There is probably a better way to do this. - value.into_series(name).new_from_index(0, length) - }; - - debug_assert_eq!(series.len(), length); - - series - } - - pub fn to_series(&self) -> Series { - Self::_to_series(self.name.clone(), self.value.clone(), self.length) - } - - /// Get the [`ScalarColumn`] as [`Series`] - /// - /// This needs to materialize upon the first call. Afterwards, this is cached. - pub fn as_materialized_series(&self) -> &Series { - self.materialized.get_or_init(|| self.to_series()) - } - - /// Take the [`ScalarColumn`] and materialize as a [`Series`] if not already done. - pub fn take_materialized_series(self) -> Series { - self.materialized - .into_inner() - .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) - } -} - -impl IntoColumn for T { - #[inline] - fn into_column(self) -> Column { - let series = self.into_series(); - - if series.len() == 1 { - // SAFETY: We just did the bounds check - let value = unsafe { series.get_unchecked(0) }; - - if let Ok(value) = value.into_static() { - let value = Scalar::new(series.dtype().clone(), value); - let mut col = ScalarColumn::new(series.name().clone(), value, 1); - col.materialized = OnceLock::from(series); - return Column::Scalar(col); - } - } - - Column::Series(series) - } -} - -impl IntoColumn for Column { - #[inline(always)] - fn into_column(self) -> Column { - self - } -} - -impl IntoColumn for ScalarColumn { - #[inline(always)] - fn into_column(self) -> Column { - self.into() - } -} - -/// We don't want to serialize the scalar columns. 
So this helps pretend that columns are always -/// initialized without implementing From for Series. -/// -/// Those casts should be explicit. -#[derive(Clone)] -#[cfg_attr(feature = "serde", derive(serde::Serialize))] -#[cfg_attr(feature = "serde", serde(into = "Series"))] -struct _SerdeSeries(Series); - -impl From for _SerdeSeries { - #[inline] - fn from(value: Column) -> Self { - Self(value.take_materialized_series()) - } -} - -impl From<_SerdeSeries> for Series { - #[inline] - fn from(value: _SerdeSeries) -> Self { - value.0 - } -} diff --git a/crates/polars-core/src/lib.rs b/crates/polars-core/src/lib.rs index 117f462619dc..a7e74b230410 100644 --- a/crates/polars-core/src/lib.rs +++ b/crates/polars-core/src/lib.rs @@ -69,3 +69,8 @@ pub static POOL: Lazy = Lazy::new(|| polars_utils::was // utility for the tests to ensure a single thread can execute pub static SINGLE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + +/// Default length for a `.head()` call +pub(crate) const HEAD_DEFAULT_LENGTH: usize = 10; +/// Default length for a `.tail()` call +pub(crate) const TAIL_DEFAULT_LENGTH: usize = 10; diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 65336b100fa1..aaa4bc753443 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -2,7 +2,6 @@ use std::fmt::Write; use arrow::bitmap::MutableBitmap; -use crate::chunked_array::builder::BinaryOffsetChunkedBuilder; #[cfg(feature = "dtype-categorical")] use crate::chunked_array::cast::CastOptions; #[cfg(feature = "object")] @@ -164,7 +163,6 @@ impl Series { #[cfg(feature = "object")] DataType::Object(_, registry) => any_values_to_object(values, registry)?, DataType::Null => Series::new_null(PlSmallStr::EMPTY, values.len()), - DataType::BinaryOffset => any_values_to_binary_offset(values, strict)?.into_series(), dt => { polars_bail!( InvalidOperation: @@ -346,43 +344,6 @@ fn any_values_to_binary(values: &[AnyValue], 
strict: bool) -> PolarsResult PolarsResult { - fn any_values_to_binary_offset_strict( - values: &[AnyValue], - ) -> PolarsResult { - let mut builder = BinaryOffsetChunkedBuilder::new(PlSmallStr::EMPTY, values.len()); - for av in values { - match av { - AnyValue::Binary(s) => builder.append_value(s), - AnyValue::BinaryOwned(s) => builder.append_value(s.as_slice()), - AnyValue::Null => builder.append_null(), - av => return Err(invalid_value_error(&DataType::Binary, av)), - } - } - Ok(builder.finish()) - } - fn any_values_to_binary_offset_nonstrict(values: &[AnyValue]) -> BinaryOffsetChunked { - values - .iter() - .map(|av| match av { - AnyValue::Binary(b) => Some(*b), - AnyValue::BinaryOwned(b) => Some(&**b), - AnyValue::String(s) => Some(s.as_bytes()), - AnyValue::StringOwned(s) => Some(s.as_str().as_bytes()), - _ => None, - }) - .collect_trusted() - } - if strict { - any_values_to_binary_offset_strict(values) - } else { - Ok(any_values_to_binary_offset_nonstrict(values)) - } -} - #[cfg(feature = "dtype-date")] fn any_values_to_date(values: &[AnyValue], strict: bool) -> PolarsResult { let mut builder = PrimitiveChunkedBuilder::::new(PlSmallStr::EMPTY, values.len()); diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 2a18d7c7a6e9..cb4a9bb84030 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -1,6 +1,7 @@ //! Type agnostic columnar data structure. pub use crate::prelude::ChunkCompare; use crate::prelude::*; +use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; pub mod amortized_iter; mod any_value; @@ -817,18 +818,14 @@ impl Series { } /// Get the head of the Series. 
pub fn head(&self, length: Option) -> Series { - match length { - Some(len) => self.slice(0, std::cmp::min(len, self.len())), - None => self.slice(0, std::cmp::min(10, self.len())), - } + let len = length.unwrap_or(HEAD_DEFAULT_LENGTH); + self.slice(0, std::cmp::min(len, self.len())) } /// Get the tail of the Series. pub fn tail(&self, length: Option) -> Series { - let len = match length { - Some(len) => std::cmp::min(len, self.len()), - None => std::cmp::min(10, self.len()), - }; + let len = length.unwrap_or(TAIL_DEFAULT_LENGTH); + let len = std::cmp::min(len, self.len()); self.slice(-(len as i64), len) } diff --git a/crates/polars-mem-engine/src/executors/group_by_rolling.rs b/crates/polars-mem-engine/src/executors/group_by_rolling.rs index 50ad9da7fef2..3e84740ea92d 100644 --- a/crates/polars-mem-engine/src/executors/group_by_rolling.rs +++ b/crates/polars-mem-engine/src/executors/group_by_rolling.rs @@ -21,10 +21,7 @@ unsafe fn update_keys(keys: &mut [Column], groups: &GroupsProxy) { // can be empty, but we still want to know the first value // of that group for key in keys.iter_mut() { - *key = key - .as_materialized_series() - .take_unchecked_from_slice(first) - .into_column(); + *key = key.take_slice_unchecked(first); } }, GroupsProxy::Slice { groups, .. 
} => { @@ -33,10 +30,7 @@ unsafe fn update_keys(keys: &mut [Column], groups: &GroupsProxy) { .iter() .map(|[first, _len]| *first) .collect_ca(PlSmallStr::EMPTY); - *key = key - .as_materialized_series() - .take_unchecked(&indices) - .into_column(); + *key = key.take_unchecked(&indices); } }, } diff --git a/crates/polars-ops/src/frame/join/asof/groups.rs b/crates/polars-ops/src/frame/join/asof/groups.rs index 6a7ccb3b76f8..9332b10e392b 100644 --- a/crates/polars-ops/src/frame/join/asof/groups.rs +++ b/crates/polars-ops/src/frame/join/asof/groups.rs @@ -32,8 +32,8 @@ pub(crate) unsafe fn compare_df_rows2( join_nulls: bool, ) -> bool { for (l, r) in left.get_columns().iter().zip(right.get_columns()) { - let l = l.as_materialized_series().get_unchecked(left_idx); - let r = r.as_materialized_series().get_unchecked(right_idx); + let l = l.get_unchecked(left_idx); + let r = r.get_unchecked(right_idx); if !l.eq_missing(&r, join_nulls) { return false; } diff --git a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs index a1ecc4a12d02..4dafb71643bf 100644 --- a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs +++ b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs @@ -72,6 +72,7 @@ pub(super) fn shift_and_fill(args: &[Column]) -> PolarsResult { let ca = s.str()?; let fill_value = match fill_value { AnyValue::String(v) => Some(v), + AnyValue::StringOwned(ref v) => Some(v.as_str()), AnyValue::Null => None, v => polars_bail!(ComputeError: "fill value '{}' is not supported", v), }; diff --git a/crates/polars-utils/src/index.rs b/crates/polars-utils/src/index.rs index 1ca29d394727..f21ba1b39284 100644 --- a/crates/polars-utils/src/index.rs +++ b/crates/polars-utils/src/index.rs @@ -127,18 +127,11 @@ impl Indexable for &[T] { pub fn check_bounds(idx: &[IdxSize], len: IdxSize) -> PolarsResult<()> { // We iterate in large uninterrupted chunks to help auto-vectorization. 
- let mut in_bounds = true; - for chunk in idx.chunks(1024) { - for i in chunk { - if *i >= len { - in_bounds = false; - } - } - if !in_bounds { - break; - } - } - polars_ensure!(in_bounds, OutOfBounds: "indices are out of bounds"); + let Some(max_idx) = idx.iter().copied().max() else { + return Ok(()); + }; + + polars_ensure!(max_idx < len, OutOfBounds: "indices are out of bounds"); Ok(()) } From e797f70017094590d47324e36e7b20174728e8b1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 11:08:09 +0200 Subject: [PATCH 29/42] scalar-opt StructArray::from_series --- crates/polars-core/src/chunked_array/cast.rs | 2 +- .../polars-core/src/chunked_array/ops/full.rs | 4 +- .../src/chunked_array/ops/sort/mod.rs | 2 +- .../polars-core/src/chunked_array/ops/zip.rs | 2 +- .../src/chunked_array/struct_/frame.rs | 4 +- .../src/chunked_array/struct_/mod.rs | 51 ++++++++-------- crates/polars-core/src/frame/row/av_buffer.rs | 2 +- crates/polars-core/src/serde/series.rs | 2 +- crates/polars-core/src/series/any_value.rs | 2 +- crates/polars-core/src/series/mod.rs | 2 +- crates/polars-core/src/series/ops/null.rs | 2 +- .../src/expressions/aggregation.rs | 2 +- .../src/chunked_array/array/to_struct.rs | 2 +- crates/polars-ops/src/chunked_array/hist.rs | 2 +- .../src/chunked_array/list/to_struct.rs | 2 +- .../src/chunked_array/strings/extract.rs | 2 +- .../src/chunked_array/strings/json_path.rs | 4 +- .../src/chunked_array/strings/split.rs | 2 +- .../polars-ops/src/frame/join/merge_sorted.rs | 2 +- crates/polars-ops/src/frame/pivot/mod.rs | 12 +--- .../polars-ops/src/frame/pivot/positioning.rs | 7 +-- crates/polars-ops/src/series/ops/cut.rs | 4 +- .../src/dsl/function_expr/struct_.rs | 8 +-- crates/polars-plan/src/dsl/name.rs | 2 +- crates/polars-python/src/map/mod.rs | 2 +- .../nodes/parquet_source/row_group_decode.rs | 60 +++++++++++-------- 26 files changed, 92 insertions(+), 96 deletions(-) diff --git a/crates/polars-core/src/chunked_array/cast.rs 
b/crates/polars-core/src/chunked_array/cast.rs index 53f6e85f221d..ea758742169e 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -125,7 +125,7 @@ fn cast_single_to_struct( new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype)); } - StructChunked::from_series(name, &new_fields).map(|ca| ca.into_series()) + StructChunked::from_series(name, new_fields.iter()).map(|ca| ca.into_series()) } impl ChunkedArray diff --git a/crates/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs index 3f797d588e47..ee307cc3ca8e 100644 --- a/crates/polars-core/src/chunked_array/ops/full.rs +++ b/crates/polars-core/src/chunked_array/ops/full.rs @@ -192,8 +192,8 @@ impl ListChunked { #[cfg(feature = "dtype-struct")] impl ChunkFullNull for StructChunked { fn full_null(name: PlSmallStr, length: usize) -> StructChunked { - let s = vec![Series::new_null(PlSmallStr::EMPTY, length)]; - StructChunked::from_series(name, &s) + let s = [Series::new_null(PlSmallStr::EMPTY, length)]; + StructChunked::from_series(name, s.iter()) .unwrap() .with_outer_validity(Some(Bitmap::new_zeroed(length))) } diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index cfe30bb59c7d..0aa70dae1c83 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -724,7 +724,7 @@ pub(crate) fn convert_sort_column_multi_sort(s: &Series) -> PolarsResult .iter() .map(convert_sort_column_multi_sort) .collect::>>()?; - let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?; out.zip_outer_validity(ca); out.into_series() }, diff --git a/crates/polars-core/src/chunked_array/ops/zip.rs b/crates/polars-core/src/chunked_array/ops/zip.rs index eb24468d892d..7fe09ba2c7d1 100644 --- 
a/crates/polars-core/src/chunked_array/ops/zip.rs +++ b/crates/polars-core/src/chunked_array/ops/zip.rs @@ -237,7 +237,7 @@ impl ChunkZip for StructChunked { .map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs)) .collect::>>()?; - let mut out = StructChunked::from_series(self.name().clone(), &fields)?; + let mut out = StructChunked::from_series(self.name().clone(), fields.iter())?; // Zip the validities. if (l.null_count + r.null_count) > 0 { diff --git a/crates/polars-core/src/chunked_array/struct_/frame.rs b/crates/polars-core/src/chunked_array/struct_/frame.rs index 92e46ac8635a..83f0f1299667 100644 --- a/crates/polars-core/src/chunked_array/struct_/frame.rs +++ b/crates/polars-core/src/chunked_array/struct_/frame.rs @@ -5,8 +5,6 @@ use crate::prelude::StructChunked; impl DataFrame { pub fn into_struct(self, name: PlSmallStr) -> StructChunked { - // @scalar-opt - let series = self.materialized_column_iter().cloned().collect::>(); - StructChunked::from_series(name, &series).expect("same invariants") + StructChunked::from_columns(name, &self.columns).expect("same invariants") } } diff --git a/crates/polars-core/src/chunked_array/struct_/mod.rs b/crates/polars-core/src/chunked_array/struct_/mod.rs index e635be7f8f13..0c4eb50ddc58 100644 --- a/crates/polars-core/src/chunked_array/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/struct_/mod.rs @@ -18,21 +18,24 @@ use crate::utils::Container; pub type StructChunked = ChunkedArray; -fn constructor(name: PlSmallStr, fields: &[Series]) -> PolarsResult { +fn constructor<'a, I: ExactSizeIterator + Clone>( + name: PlSmallStr, + fields: I, +) -> PolarsResult { // Different chunk lengths: rechunk and recurse. 
- if !fields.iter().map(|s| s.n_chunks()).all_equal() { - let fields = fields.iter().map(|s| s.rechunk()).collect::>(); - return constructor(name, &fields); + if !fields.clone().map(|s| s.n_chunks()).all_equal() { + let fields = fields.map(|s| s.rechunk()).collect::>(); + return constructor(name, fields.iter()); } - let n_chunks = fields[0].n_chunks(); - let dtype = DataType::Struct(fields.iter().map(|s| s.field().into_owned()).collect()); + let n_chunks = fields.clone().next().unwrap().n_chunks(); + let dtype = DataType::Struct(fields.clone().map(|s| s.field().into_owned()).collect()); let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest()); let chunks = (0..n_chunks) .map(|c_i| { let fields = fields - .iter() + .clone() .map(|field| field.chunks()[c_i].clone()) .collect::>(); @@ -55,30 +58,28 @@ fn constructor(name: PlSmallStr, fields: &[Series]) -> PolarsResult { - let fields = fields.iter().map(|s| s.rechunk()).collect::>(); - constructor(name, &fields) + let fields = fields.map(|s| s.rechunk()).collect::>(); + constructor(name, fields.iter()) }, } } impl StructChunked { pub fn from_columns(name: PlSmallStr, fields: &[Column]) -> PolarsResult { - // @scalar-opt! 
- let series = fields - .iter() - .map(|c| c.as_materialized_series().clone()) - .collect::>(); - Self::from_series(name, &series) + Self::from_series(name, fields.iter().map(|c| c.as_materialized_series())) } - pub fn from_series(name: PlSmallStr, fields: &[Series]) -> PolarsResult { + pub fn from_series<'a, I: ExactSizeIterator + Clone>( + name: PlSmallStr, + fields: I, + ) -> PolarsResult { let mut names = PlHashSet::with_capacity(fields.len()); - let first_len = fields.first().map(|s| s.len()).unwrap_or(0); + let first_len = fields.clone().next().map(|s| s.len()).unwrap_or(0); let mut max_len = first_len; let mut all_equal_len = true; let mut is_empty = false; - for s in fields { + for s in fields.clone() { let s_len = s.len(); max_len = std::cmp::max(max_len, s_len); @@ -117,10 +118,10 @@ impl StructChunked { ); } } - constructor(name, &new_fields) - } else if fields.is_empty() { - let fields = &[Series::new_null(PlSmallStr::EMPTY, 0)]; - constructor(name, fields) + constructor(name, new_fields.iter()) + } else if fields.len() == 0 { + let fields = [Series::new_null(PlSmallStr::EMPTY, 0)]; + constructor(name, fields.iter()) } else { constructor(name, fields) } @@ -184,7 +185,7 @@ impl StructChunked { }) .collect::>>()?; - let mut out = Self::from_series(self.name().clone(), &new_fields)?; + let mut out = Self::from_series(self.name().clone(), new_fields.iter())?; if self.null_count > 0 { out.zip_outer_validity(self); } @@ -240,7 +241,7 @@ impl StructChunked { } }) .collect::>>()?; - let mut out = Self::from_series(self.name().clone(), &fields)?; + let mut out = Self::from_series(self.name().clone(), fields.iter())?; if self.null_count > 0 { out.zip_outer_validity(self); } @@ -285,7 +286,7 @@ impl StructChunked { .iter() .map(func) .collect::>>()?; - Self::from_series(self.name().clone(), &fields).map(|mut ca| { + Self::from_series(self.name().clone(), fields.iter()).map(|mut ca| { if self.null_count > 0 { // SAFETY: we don't change types/ lengths. 
unsafe { diff --git a/crates/polars-core/src/frame/row/av_buffer.rs b/crates/polars-core/src/frame/row/av_buffer.rs index 608d6ec820af..f46332021ef1 100644 --- a/crates/polars-core/src/frame/row/av_buffer.rs +++ b/crates/polars-core/src/frame/row/av_buffer.rs @@ -624,7 +624,7 @@ impl<'a> AnyValueBufferTrusted<'a> { s }) .collect::>(); - StructChunked::from_series(PlSmallStr::EMPTY, &v) + StructChunked::from_series(PlSmallStr::EMPTY, v.iter()) .unwrap() .into_series() }, diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 3506a0e9cc89..0ef07e702374 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -277,7 +277,7 @@ impl<'de> Deserialize<'de> for Series { #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { let values: Vec = map.next_value()?; - let ca = StructChunked::from_series(name.clone(), &values).unwrap(); + let ca = StructChunked::from_series(name.clone(), values.iter()).unwrap(); let mut s = ca.into_series(); s.rename(name); Ok(s) diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index aaa4bc753443..30fba0a9cb14 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -743,7 +743,7 @@ fn any_values_to_struct( series_fields.push(s) } - let mut out = StructChunked::from_series(PlSmallStr::EMPTY, &series_fields)?; + let mut out = StructChunked::from_series(PlSmallStr::EMPTY, series_fields.iter())?; if has_outer_validity { let mut validity = MutableBitmap::new(); validity.extend_constant(values.len(), true); diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index cb4a9bb84030..bd46b4c6a3d8 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -630,7 +630,7 @@ impl Series { .iter() .map(|s| s.to_physical_repr().into_owned()) .collect(); - let mut ca = 
StructChunked::from_series(self.name().clone(), &fields).unwrap(); + let mut ca = StructChunked::from_series(self.name().clone(), fields.iter()).unwrap(); if arr.null_count() > 0 { ca.zip_outer_validity(arr); diff --git a/crates/polars-core/src/series/ops/null.rs b/crates/polars-core/src/series/ops/null.rs index ee33c309687e..edff23e5d31f 100644 --- a/crates/polars-core/src/series/ops/null.rs +++ b/crates/polars-core/src/series/ops/null.rs @@ -55,7 +55,7 @@ impl Series { .iter() .map(|fld| Series::full_null(fld.name().clone(), size, fld.dtype())) .collect::>(); - let ca = StructChunked::from_series(name, &fields).unwrap(); + let ca = StructChunked::from_series(name, fields.iter()).unwrap(); if !fields.is_empty() { ca.with_outer_validity(Some(Bitmap::new_zeroed(size))) diff --git a/crates/polars-expr/src/expressions/aggregation.rs b/crates/polars-expr/src/expressions/aggregation.rs index 297c77b19e00..8e2563e526e0 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -502,7 +502,7 @@ impl PartitionedAggregation for AggregationExpr { }; let mut count_s = series.agg_valid_count(groups); count_s.rename(PlSmallStr::from_static("__POLARS_COUNT")); - Ok(StructChunked::from_series(new_name, &[agg_s, count_s]) + Ok(StructChunked::from_series(new_name, [agg_s, count_s].iter()) .unwrap() .into_series()) } diff --git a/crates/polars-ops/src/chunked_array/array/to_struct.rs b/crates/polars-ops/src/chunked_array/array/to_struct.rs index b79a9ffcfe9f..b00dbbf4d43b 100644 --- a/crates/polars-ops/src/chunked_array/array/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/array/to_struct.rs @@ -40,7 +40,7 @@ pub trait ToStruct: AsArray { .collect::>>() })?; - StructChunked::from_series(ca.name().clone(), &fields) + StructChunked::from_series(ca.name().clone(), fields.iter()) } } diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index 8d7781745531..ca906d12851c 
100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -136,7 +136,7 @@ where let out = fields.pop().unwrap(); out.with_name(ca.name().clone()) } else { - StructChunked::from_series(ca.name().clone(), &fields) + StructChunked::from_series(ca.name().clone(), fields.iter()) .unwrap() .into_series() } diff --git a/crates/polars-ops/src/chunked_array/list/to_struct.rs b/crates/polars-ops/src/chunked_array/list/to_struct.rs index 73798163ed48..cdd245bce8b7 100644 --- a/crates/polars-ops/src/chunked_array/list/to_struct.rs +++ b/crates/polars-ops/src/chunked_array/list/to_struct.rs @@ -80,7 +80,7 @@ pub trait ToStruct: AsList { .collect::>>() })?; - StructChunked::from_series(ca.name().clone(), &fields) + StructChunked::from_series(ca.name().clone(), fields.iter()) } } diff --git a/crates/polars-ops/src/chunked_array/strings/extract.rs b/crates/polars-ops/src/chunked_array/strings/extract.rs index 35f38e40d61d..cb26d66f7aff 100644 --- a/crates/polars-ops/src/chunked_array/strings/extract.rs +++ b/crates/polars-ops/src/chunked_array/strings/extract.rs @@ -50,7 +50,7 @@ pub(super) fn extract_groups( if n_fields == 1 { return StructChunked::from_series( ca.name().clone(), - &[Series::new_null(ca.name().clone(), ca.len())], + [Series::new_null(ca.name().clone(), ca.len())].iter(), ) .map(|ca| ca.into_series()); } diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index a25ce1937332..7aa77ca23e86 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -204,10 +204,10 @@ mod tests { let expected_series = StructChunked::from_series( "".into(), - &[ + [ Series::new("a".into(), &[None, Some(1), Some(2), None]), Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]), - ], + ].iter(), ) .unwrap() .with_outer_validity_chunked(BooleanChunked::new("".into(), 
[false, true, true, false])) diff --git a/crates/polars-ops/src/chunked_array/strings/split.rs b/crates/polars-ops/src/chunked_array/strings/split.rs index d86e0efac2ae..31c15a70cb08 100644 --- a/crates/polars-ops/src/chunked_array/strings/split.rs +++ b/crates/polars-ops/src/chunked_array/strings/split.rs @@ -149,7 +149,7 @@ where }) .collect::>(); - StructChunked::from_series(ca.name().clone(), &fields) + StructChunked::from_series(ca.name().clone(), fields.iter()) } pub fn split_helper<'a, F, I>(ca: &'a StringChunked, by: &'a StringChunked, op: F) -> ListChunked diff --git a/crates/polars-ops/src/frame/join/merge_sorted.rs b/crates/polars-ops/src/frame/join/merge_sorted.rs index 8ab303fae2c1..a180b293ca0f 100644 --- a/crates/polars-ops/src/frame/join/merge_sorted.rs +++ b/crates/polars-ops/src/frame/join/merge_sorted.rs @@ -85,7 +85,7 @@ fn merge_series(lhs: &Series, rhs: &Series, merge_indicator: &[bool]) -> PolarsR .zip(rhs.fields_as_series()) .map(|(lhs, rhs)| merge_series(lhs, &rhs, merge_indicator)) .collect::>>()?; - StructChunked::from_series(PlSmallStr::EMPTY, &new_fields) + StructChunked::from_series(PlSmallStr::EMPTY, new_fields.iter()) .unwrap() .into_series() }, diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index d681b5db8a90..15753a7c49a7 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -233,15 +233,9 @@ fn pivot_impl( already exists in the DataFrame. 
Please rename it prior to calling `pivot`.") } // @scalar-opt - let columns_struct = StructChunked::from_series( - column.clone(), - &fields - .iter() - .map(|c| c.as_materialized_series().clone()) - .collect::>(), - ) - .unwrap() - .into_series(); + let columns_struct = StructChunked::from_columns(column.clone(), fields) + .unwrap() + .into_series(); let mut binding = pivot_df.clone(); let pivot_df = unsafe { binding.with_column_unchecked(columns_struct) }; pivot_impl_single_column( diff --git a/crates/polars-ops/src/frame/pivot/positioning.rs b/crates/polars-ops/src/frame/pivot/positioning.rs index 7b19872a1bc3..0e0de1083c5b 100644 --- a/crates/polars-ops/src/frame/pivot/positioning.rs +++ b/crates/polars-ops/src/frame/pivot/positioning.rs @@ -484,14 +484,9 @@ pub(super) fn compute_row_idx( } } else { let binding = pivot_df.select(index.iter().cloned())?; - // @scalar-opt let fields = binding.get_columns(); - let fields = fields - .iter() - .map(|c| c.as_materialized_series().clone()) - .collect::>(); let index_struct_series = - StructChunked::from_series(PlSmallStr::from_static("placeholder"), &fields)? + StructChunked::from_columns(PlSmallStr::from_static("placeholder"), fields)? 
.into_series(); let index_agg = unsafe { index_struct_series.agg_first(groups) }; let index_agg_physical = index_agg.to_physical_repr(); diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index cba643cf98e9..52cc2ee5a67a 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -57,13 +57,13 @@ fn map_cats( }, }); - let outvals = vec![ + let outvals = [ brk_vals.finish().into_series(), bld.finish() ._with_fast_unique(label_has_value.iter().all(bool::clone)) .into_series(), ]; - Ok(StructChunked::from_series(out_name, &outvals)?.into_series()) + Ok(StructChunked::from_series(out_name, outvals.iter())?.into_series()) } else { Ok(bld .drain_iter_and_finish(s_iter.map(|opt| { diff --git a/crates/polars-plan/src/dsl/function_expr/struct_.rs b/crates/polars-plan/src/dsl/function_expr/struct_.rs index 3c72138e6241..acc8020b8e7e 100644 --- a/crates/polars-plan/src/dsl/function_expr/struct_.rs +++ b/crates/polars-plan/src/dsl/function_expr/struct_.rs @@ -176,7 +176,7 @@ pub(super) fn rename_fields(s: &Column, names: Arc<[PlSmallStr]>) -> PolarsResul s }) .collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } @@ -193,7 +193,7 @@ pub(super) fn prefix_fields(s: &Column, prefix: &str) -> PolarsResult { s }) .collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } @@ -210,7 +210,7 @@ pub(super) fn suffix_fields(s: &Column, suffix: &str) -> PolarsResult { s }) .collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), &fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) 
} @@ -245,7 +245,7 @@ pub(super) fn with_fields(args: &[Column]) -> PolarsResult { } let new_fields = fields.into_values().cloned().collect::>(); - let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?; + let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?; out.zip_outer_validity(ca); Ok(out.into_column()) } diff --git a/crates/polars-plan/src/dsl/name.rs b/crates/polars-plan/src/dsl/name.rs index 1df62a767721..1261b4430bec 100644 --- a/crates/polars-plan/src/dsl/name.rs +++ b/crates/polars-plan/src/dsl/name.rs @@ -76,7 +76,7 @@ impl ExprNameNameSpace { fd }) .collect::>(); - let mut out = StructChunked::from_series(s.name().clone(), &fields)?; + let mut out = StructChunked::from_series(s.name().clone(), fields.iter())?; out.zip_outer_validity(s); Ok(Some(out.into_column())) }, diff --git a/crates/polars-python/src/map/mod.rs b/crates/polars-python/src/map/mod.rs index 8f6ed1518fe8..ef1bb4e34507 100644 --- a/crates/polars-python/src/map/mod.rs +++ b/crates/polars-python/src/map/mod.rs @@ -122,7 +122,7 @@ fn iterator_to_struct<'a>( .collect::>() }); - Ok(StructChunked::from_series(name, &fields) + Ok(StructChunked::from_series(name, fields.iter()) .unwrap() .into_series() .into()) diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs index 028595c0bfa9..ae32dd38025c 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs @@ -2,10 +2,10 @@ use std::sync::Arc; use polars_core::frame::DataFrame; use polars_core::prelude::{ - ArrowField, ArrowSchema, BooleanChunked, ChunkFull, IdxCa, StringChunked, - IntColumn, Column, + AnyValue, ArrowField, ArrowSchema, BooleanChunked, Column, DataType, IdxCa, IntoColumn, }; -use polars_core::series::{IntoSeries, IsSorted, Series}; +use polars_core::scalar::Scalar; +use polars_core::series::{IsSorted, 
Series}; use polars_core::utils::arrow::bitmap::{Bitmap, MutableBitmap}; use polars_error::{polars_bail, PolarsResult}; use polars_io::predicates::PhysicalIoExpr; @@ -138,25 +138,33 @@ impl RowGroupDecoder { let path_index = row_group_data.path_index; let hive_series = if let Some(hp) = self.hive_partitions.as_deref() { - let mut v = hp[path_index].materialize_partition_columns(); - for s in v.iter_mut() { - *s = s.new_from_index(0, row_group_data.file_max_row_group_height); - } - v + let v = hp[path_index].materialize_partition_columns(); + v.into_iter() + .map(|s| { + s.into_column() + .new_from_index(0, row_group_data.file_max_row_group_height) + }) + .collect() } else { vec![] }; + // @scalar-opt let file_path_series = self.include_file_paths.clone().map(|file_path_col| { - StringChunked::full( + Column::new_scalar( file_path_col, - self.scan_sources - .get(path_index) - .unwrap() - .to_include_path_name(), + Scalar::new( + DataType::String, + AnyValue::StringOwned( + self.scan_sources + .get(path_index) + .unwrap() + .to_include_path_name() + .into(), + ), + ), row_group_data.file_max_row_group_height, ) - .into_series() }); SharedFileState { @@ -170,7 +178,7 @@ impl RowGroupDecoder { &self, row_group_data: &RowGroupData, slice_range: core::ops::Range, - ) -> PolarsResult> { + ) -> PolarsResult> { if let Some(RowIndex { name, offset }) = self.row_index.as_ref() { let projection_height = row_group_data.row_group_metadata.num_rows(); @@ -208,7 +216,7 @@ impl RowGroupDecoder { /// `out_vec`. async fn decode_all_columns( &self, - out_vec: &mut Vec, + out_vec: &mut Vec, row_group_data: &Arc, filter: Option, ) -> PolarsResult<()> { @@ -305,7 +313,7 @@ fn decode_column( arrow_field: &ArrowField, row_group_data: &RowGroupData, filter: Option, -) -> PolarsResult { +) -> PolarsResult { let columns_to_deserialize = row_group_data .row_group_metadata .columns_under_root_iter(&arrow_field.name) @@ -331,16 +339,16 @@ fn decode_column( // TODO: Also load in the metadata. 
- Ok(series) + Ok(series.into()) } /// # Safety /// All series in `cols` have the same length. async unsafe fn filter_cols( - mut cols: Vec, + mut cols: Vec, mask: &BooleanChunked, min_values_per_thread: usize, -) -> PolarsResult> { +) -> PolarsResult> { if cols.is_empty() { return Ok(cols); } @@ -418,8 +426,8 @@ fn calc_cols_per_thread( /// State shared across row groups for a single file. pub(super) struct SharedFileState { path_index: usize, - hive_series: Vec, - file_path_series: Option, + hive_series: Vec, + file_path_series: Option, } /// @@ -567,7 +575,7 @@ fn decode_column_prefiltered( prefilter_setting: &PrefilterMaskSetting, mask: &BooleanChunked, mask_bitmap: &Bitmap, -) -> PolarsResult { +) -> PolarsResult { let columns_to_deserialize = row_group_data .row_group_metadata .columns_under_root_iter(&arrow_field.name) @@ -594,12 +602,12 @@ fn decode_column_prefiltered( deserialize_filter, )?; - let series = Series::try_from((arrow_field, array))?; + let column = Series::try_from((arrow_field, array))?.into_column(); if !prefilter { - series.filter(mask) + column.filter(mask) } else { - Ok(series) + Ok(column) } } From b4b4d5214476fc227da7c833270d3bafa839c6a7 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 11:08:23 +0200 Subject: [PATCH 30/42] format --- crates/polars-core/src/series/mod.rs | 3 ++- crates/polars-expr/src/expressions/aggregation.rs | 8 +++++--- crates/polars-ops/src/chunked_array/strings/json_path.rs | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index bd46b4c6a3d8..5a84ca2f158b 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -630,7 +630,8 @@ impl Series { .iter() .map(|s| s.to_physical_repr().into_owned()) .collect(); - let mut ca = StructChunked::from_series(self.name().clone(), fields.iter()).unwrap(); + let mut ca = + StructChunked::from_series(self.name().clone(), 
fields.iter()).unwrap(); if arr.null_count() > 0 { ca.zip_outer_validity(arr); diff --git a/crates/polars-expr/src/expressions/aggregation.rs b/crates/polars-expr/src/expressions/aggregation.rs index 8e2563e526e0..cdac9a46610a 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -502,9 +502,11 @@ impl PartitionedAggregation for AggregationExpr { }; let mut count_s = series.agg_valid_count(groups); count_s.rename(PlSmallStr::from_static("__POLARS_COUNT")); - Ok(StructChunked::from_series(new_name, [agg_s, count_s].iter()) - .unwrap() - .into_series()) + Ok( + StructChunked::from_series(new_name, [agg_s, count_s].iter()) + .unwrap() + .into_series(), + ) } }, GroupByMethod::Implode => { diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index 7aa77ca23e86..fe8783530d6a 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -207,7 +207,8 @@ mod tests { [ Series::new("a".into(), &[None, Some(1), Some(2), None]), Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]), - ].iter(), + ] + .iter(), ) .unwrap() .with_outer_validity_chunked(BooleanChunked::new("".into(), [false, true, true, false])) From 6d3c40e3864966b61933084e79217ae27b707459 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 11:23:34 +0200 Subject: [PATCH 31/42] extract str_value --- crates/polars-core/src/datatypes/any_value.rs | 19 + crates/polars-core/src/fmt.rs | 10 +- .../src/frame/column/arithmetic.rs | 192 +++ crates/polars-core/src/frame/column/mod.rs | 1096 +++++++++++++++++ crates/polars-core/src/series/mod.rs | 15 +- 5 files changed, 1314 insertions(+), 18 deletions(-) create mode 100644 crates/polars-core/src/frame/column/arithmetic.rs create mode 100644 crates/polars-core/src/frame/column/mod.rs diff --git 
a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 43ca425796df..f7461d370b87 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + #[cfg(feature = "dtype-struct")] use arrow::legacy::trusted_len::TrustedLenPush; use arrow::types::PrimitiveType; @@ -703,6 +705,23 @@ impl<'a> AnyValue<'a> { _ => panic!("expected index type found {self:?}"), } } + + pub fn str_value(&self) -> Cow<'a, str> { + match self { + Self::String(s) => Cow::Borrowed(s), + Self::StringOwned(s) => Cow::Owned(s.to_string()), + Self::Null => Cow::Borrowed("null"), + #[cfg(feature = "dtype-categorical")] + Self::Categorical(idx, rev, arr) | AnyValue::Enum(idx, rev, arr) => { + if arr.is_null() { + Cow::Borrowed(rev.get(*idx)) + } else { + unsafe { Cow::Borrowed(arr.deref_unchecked().value(*idx as usize)) } + } + }, + av => Cow::Owned(av.to_string()), + } + } } impl From> for DataType { diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index bcb66b441ccd..c8ab72d02031 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -616,8 +616,9 @@ impl Display for DataFrame { for i in 0..(half + rest) { let row = self - .materialized_column_iter() - .map(|s| s.str_value(i).unwrap()) + .get_columns() + .iter() + .map(|c| c.str_value(i).unwrap()) .collect(); let row_strings = @@ -629,8 +630,9 @@ impl Display for DataFrame { rows.push(dots); for i in (height - half)..height { let row = self - .materialized_column_iter() - .map(|s| s.str_value(i).unwrap()) + .get_columns() + .iter() + .map(|c| c.str_value(i).unwrap()) .collect(); let row_strings = diff --git a/crates/polars-core/src/frame/column/arithmetic.rs b/crates/polars-core/src/frame/column/arithmetic.rs new file mode 100644 index 000000000000..641d74f5ae6b --- /dev/null +++ b/crates/polars-core/src/frame/column/arithmetic.rs @@ -0,0 +1,192 @@ +use 
std::ops::{Add, Div, Mul, Rem, Sub}; + +use num_traits::{Num, NumCast}; +use polars_error::PolarsResult; + +use super::Column; + +impl Add for Column { + type Output = PolarsResult; + + fn add(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .add(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Add for &Column { + type Output = PolarsResult; + + fn add(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .add(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Sub for Column { + type Output = PolarsResult; + + fn sub(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .sub(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Sub for &Column { + type Output = PolarsResult; + + fn sub(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .sub(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Mul for Column { + type Output = PolarsResult; + + fn mul(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .mul(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Mul for &Column { + type Output = PolarsResult; + + fn mul(self, rhs: Self) -> Self::Output { + // @scalar-opt + self.as_materialized_series() + .mul(rhs.as_materialized_series()) + .map(Column::from) + } +} + +impl Sub for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn sub(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().sub(rhs).into() + } +} + +impl Sub for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn sub(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().sub(rhs).into() + } +} + +impl Add for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn add(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().add(rhs).into() + } +} + 
+impl Add for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn add(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().add(rhs).into() + } +} + +impl Div for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn div(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().div(rhs).into() + } +} + +impl Div for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn div(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().div(rhs).into() + } +} + +impl Mul for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn mul(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().mul(rhs).into() + } +} + +impl Mul for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn mul(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().mul(rhs).into() + } +} + +impl Rem for &Column +where + T: Num + NumCast, +{ + type Output = Column; + + fn rem(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().rem(rhs).into() + } +} + +impl Rem for Column +where + T: Num + NumCast, +{ + type Output = Self; + + fn rem(self, rhs: T) -> Self::Output { + // @scalar-opt + self.as_materialized_series().rem(rhs).into() + } +} diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs new file mode 100644 index 000000000000..aac04c7879a6 --- /dev/null +++ b/crates/polars-core/src/frame/column/mod.rs @@ -0,0 +1,1096 @@ +use std::borrow::Cow; +use std::sync::OnceLock; + +use num_traits::{Num, NumCast}; +use polars_error::PolarsResult; +use polars_utils::index::check_bounds; +use polars_utils::pl_str::PlSmallStr; + +use self::gather::check_bounds_ca; +use crate::chunked_array::metadata::MetadataFlags; +use crate::prelude::*; +use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; +use crate::utils::{slice_offsets, Container}; +use 
crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; + +mod arithmetic; + +/// A column within a [`DataFrame`]. +/// +/// This is lazily initialized to a [`Series`] with methods like +/// [`as_materialized_series`][Column::as_materialized_series] and +/// [`take_materialized_series`][Column::take_materialized_series]. +/// +/// Currently, there are two ways to represent a [`Column`]. +/// 1. A [`Series`] of values +/// 2. A [`ScalarColumn`] that repeats a single [`Scalar`] +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] +#[cfg_attr(feature = "serde", serde(from = "Series"))] +#[cfg_attr(feature = "serde", serde(into = "_SerdeSeries"))] +pub enum Column { + Series(Series), + Scalar(ScalarColumn), +} + +/// A [`Column`] that consists of a repeated [`Scalar`] +/// +/// This is lazily materialized into a [`Series`]. +#[derive(Debug, Clone)] +pub struct ScalarColumn { + name: PlSmallStr, + value: Scalar, + length: usize, + + // invariants: + // materialized.name() == name + // materialized.len() == length + // materialized.dtype() == value.dtype + // materialized[i] == value, for all 0 <= i < length + /// A lazily materialized [`Series`] variant of this [`ScalarColumn`] + materialized: OnceLock, +} + +/// Convert `Self` into a [`Column`] +pub trait IntoColumn: Sized { + fn into_column(self) -> Column; +} + +impl Column { + #[inline] + pub fn new(name: PlSmallStr, values: T) -> Self + where + Phantom: ?Sized, + Series: NamedFrom, + { + Self::Series(NamedFrom::new(name, values)) + } + + #[inline] + pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { + Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), 0) + } + + #[inline] + pub fn new_scalar(name: PlSmallStr, value: Scalar, length: usize) -> Self { + Self::Scalar(ScalarColumn::new(name, value, length)) + } + + // # Materialize + /// Get a reference to a [`Series`] for this [`Column`] + /// + /// This may need to materialize the [`Series`] on 
the first invocation for a specific column. + #[inline] + pub fn as_materialized_series(&self) -> &Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => s.as_materialized_series(), + } + } + /// Turn [`Column`] into a [`Column::Series`]. + /// + /// This may need to materialize the [`Series`] on the first invocation for a specific column. + #[inline] + pub fn into_materialized_series(&mut self) -> &mut Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => { + let series = s.materialized.take().unwrap_or_else(|| s.to_series()); + *self = Column::Series(series); + let Column::Series(s) = self else { + unreachable!(); + }; + s + }, + } + } + /// Take [`Series`] from a [`Column`] + /// + /// This may need to materialize the [`Series`] on the first invocation for a specific column. + #[inline] + pub fn take_materialized_series(self) -> Series { + match self { + Column::Series(s) => s, + Column::Scalar(s) => s.take_materialized_series(), + } + } + + #[inline] + pub fn dtype(&self) -> &DataType { + match self { + Column::Series(s) => s.dtype(), + Column::Scalar(s) => s.value.dtype(), + } + } + + #[inline] + pub fn field(&self) -> Cow { + match self { + Column::Series(s) => s.field(), + Column::Scalar(s) => match s.materialized.get() { + None => Cow::Owned(Field::new(s.name.clone(), s.value.dtype().clone())), + Some(s) => s.field(), + }, + } + } + + #[inline] + pub fn name(&self) -> &PlSmallStr { + match self { + Column::Series(s) => s.name(), + Column::Scalar(s) => &s.name, + } + } + + #[inline] + pub fn len(&self) -> usize { + match self { + Column::Series(s) => s.len(), + Column::Scalar(s) => s.length, + } + } + + #[inline] + pub fn with_name(mut self, name: PlSmallStr) -> Column { + self.rename(name); + self + } + + #[inline] + pub fn rename(&mut self, name: PlSmallStr) { + match self { + Column::Series(s) => _ = s.rename(name), + Column::Scalar(s) => { + if let Some(series) = s.materialized.get_mut() { + 
series.rename(name.clone()); + } + + s.name = name; + }, + } + } + + // # Downcasting + #[inline] + pub fn as_series(&self) -> Option<&Series> { + match self { + Column::Series(s) => Some(s), + Column::Scalar(_) => None, + } + } + #[inline] + pub fn as_scalar_column(&self) -> Option<&ScalarColumn> { + match self { + Column::Series(_) => None, + Column::Scalar(s) => Some(s), + } + } + + // # To Chunked Arrays + pub fn bool(&self) -> PolarsResult<&BooleanChunked> { + // @scalar-opt + self.as_materialized_series().bool() + } + pub fn i8(&self) -> PolarsResult<&Int8Chunked> { + // @scalar-opt + self.as_materialized_series().i8() + } + pub fn i16(&self) -> PolarsResult<&Int16Chunked> { + // @scalar-opt + self.as_materialized_series().i16() + } + pub fn i32(&self) -> PolarsResult<&Int32Chunked> { + // @scalar-opt + self.as_materialized_series().i32() + } + pub fn i64(&self) -> PolarsResult<&Int64Chunked> { + // @scalar-opt + self.as_materialized_series().i64() + } + pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { + // @scalar-opt + self.as_materialized_series().u8() + } + pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { + // @scalar-opt + self.as_materialized_series().u16() + } + pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { + // @scalar-opt + self.as_materialized_series().u32() + } + pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { + // @scalar-opt + self.as_materialized_series().u64() + } + pub fn f32(&self) -> PolarsResult<&Float32Chunked> { + // @scalar-opt + self.as_materialized_series().f32() + } + pub fn f64(&self) -> PolarsResult<&Float64Chunked> { + // @scalar-opt + self.as_materialized_series().f64() + } + pub fn str(&self) -> PolarsResult<&StringChunked> { + // @scalar-opt + self.as_materialized_series().str() + } + pub fn list(&self) -> PolarsResult<&ListChunked> { + // @scalar-opt + self.as_materialized_series().list() + } + pub fn binary(&self) -> PolarsResult<&BinaryChunked> { + // @scalar-opt + self.as_materialized_series().binary() + } + 
pub fn idx(&self) -> PolarsResult<&IdxCa> { + // @scalar-opt + self.as_materialized_series().idx() + } + pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { + // @scalar-opt + self.as_materialized_series().binary_offset() + } + #[cfg(feature = "dtype-datetime")] + pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { + // @scalar-opt + self.as_materialized_series().datetime() + } + #[cfg(feature = "dtype-struct")] + pub fn struct_(&self) -> PolarsResult<&StructChunked> { + // @scalar-opt + self.as_materialized_series().struct_() + } + #[cfg(feature = "dtype-decimal")] + pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { + // @scalar-opt + self.as_materialized_series().decimal() + } + #[cfg(feature = "dtype-array")] + pub fn array(&self) -> PolarsResult<&ArrayChunked> { + // @scalar-opt + self.as_materialized_series().array() + } + #[cfg(feature = "dtype-categorical")] + pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { + self.as_materialized_series().categorical() + } + #[cfg(feature = "dtype-date")] + pub fn date(&self) -> PolarsResult<&DateChunked> { + // @scalar-opt + self.as_materialized_series().date() + } + #[cfg(feature = "dtype-duration")] + pub fn duration(&self) -> PolarsResult<&DurationChunked> { + // @scalar-opt + self.as_materialized_series().duration() + } + + // # Casting + pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .strict_cast(dtype) + .map(Column::from) + } + pub fn cast(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().cast(dtype).map(Column::from) + } + /// # Safety + /// + /// This can lead to invalid memory access in downstream code. 
+ pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { + // @scalar-opt + unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) + } + + pub fn clear(&self) -> Self { + match self { + Column::Series(s) => s.clear().into(), + Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), + } + } + + #[inline] + pub fn shrink_to_fit(&mut self) { + match self { + Column::Series(s) => s.shrink_to_fit(), + Column::Scalar(_) => {}, + } + } + + #[inline] + pub fn new_from_index(&self, index: usize, length: usize) -> Self { + match self { + Column::Series(s) => s.new_from_index(index, length).into(), + Column::Scalar(s) => { + if index >= s.length { + Self::full_null(s.name.clone(), length, s.value.dtype()) + } else { + s.resize(length).into() + } + }, + } + } + + #[inline] + pub fn has_nulls(&self) -> bool { + match self { + Self::Series(s) => s.has_nulls(), + Self::Scalar(s) => s.has_nulls(), + } + } + + #[inline] + pub fn is_null(&self) -> BooleanChunked { + match self { + Self::Series(s) => s.is_null(), + Self::Scalar(s) => BooleanChunked::full(s.name.clone(), s.value.is_null(), s.length), + } + } + #[inline] + pub fn is_not_null(&self) -> BooleanChunked { + match self { + Self::Series(s) => s.is_not_null(), + Self::Scalar(s) => BooleanChunked::full(s.name.clone(), !s.value.is_null(), s.length), + } + } + + pub fn to_physical_repr(&self) -> Column { + // @scalar-opt + self.as_materialized_series() + .to_physical_repr() + .into_owned() + .into() + } + + pub fn head(&self, length: Option) -> Column { + let len = length.unwrap_or(HEAD_DEFAULT_LENGTH); + let len = usize::min(len, self.len()); + self.slice(0, len) + } + pub fn tail(&self, length: Option) -> Column { + let len = length.unwrap_or(TAIL_DEFAULT_LENGTH); + let len = usize::min(len, self.len()); + debug_assert!(len <= i64::MAX as usize); + self.slice(-(len as i64), len) + } + /// # Safety + /// + /// No bounds checks are performed on offset and length + pub 
fn slice(&self, offset: i64, length: usize) -> Column { + match self { + Column::Series(s) => s.slice(offset, length).into(), + Column::Scalar(s) => { + let (_, length) = slice_offsets(offset, length, s.length); + s.resize(length).into() + }, + } + } + + pub fn split_at(&self, offset: i64) -> (Column, Column) { + // @scalar-opt + let (l, r) = self.as_materialized_series().split_at(offset); + (l.into(), r.into()) + } + + #[inline] + pub fn null_count(&self) -> usize { + match self { + Self::Series(s) => s.null_count(), + Self::Scalar(s) if s.value.is_null() => s.length, + Self::Scalar(_) => 0, + } + } + + pub fn take(&self, indices: &IdxCa) -> PolarsResult { + check_bounds_ca(indices, self.len() as IdxSize)?; + Ok(unsafe { self.take_unchecked(indices) }) + } + pub fn take_slice(&self, indices: &[IdxSize]) -> PolarsResult { + check_bounds(indices, self.len() as IdxSize)?; + Ok(unsafe { self.take_slice_unchecked(indices) }) + } + /// # Safety + /// + /// No bounds checks on the indexes are performed. + pub unsafe fn take_unchecked(&self, indices: &IdxCa) -> Column { + debug_assert!(check_bounds_ca(indices, self.len() as IdxSize).is_ok()); + + match self { + Self::Series(s) => unsafe { s.take_unchecked(indices) }.into(), + Self::Scalar(s) => s.resize(indices.len()).into(), + } + } + /// # Safety + /// + /// No bounds checks on the indexes are performed. + pub unsafe fn take_slice_unchecked(&self, indices: &[IdxSize]) -> Column { + debug_assert!(check_bounds(indices, self.len() as IdxSize).is_ok()); + + match self { + Self::Series(s) => unsafe { s.take_unchecked_from_slice(indices) }.into(), + Self::Scalar(s) => s.resize(indices.len()).into(), + } + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct.
+ #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_min(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_max(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_mean(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_sum(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_first(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_last(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. 
+ pub unsafe fn agg_quantile( + &self, + groups: &GroupsProxy, + quantile: f64, + interpol: QuantileInterpolOptions, + ) -> Self { + // @scalar-opt + unsafe { + self.as_materialized_series() + .agg_quantile(groups, quantile, interpol) + } + .into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_median(groups) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. + #[cfg(feature = "algorithm_group_by")] + pub(crate) unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() + } + + /// # Safety + /// + /// Does no bounds checks, groups must be correct. 
+ #[cfg(feature = "algorithm_group_by")] + pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { + // @scalar-opt + unsafe { self.as_materialized_series().agg_list(groups) }.into() + } + + pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Self { + Series::full_null(name, size, dtype).into() + // @TODO: This causes failures + // Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), size) + } + + pub fn is_empty(&self) -> bool { + // @scalar-opt + self.as_materialized_series().is_empty() + } + + pub fn reverse(&self) -> Column { + match self { + Column::Series(s) => s.reverse().into(), + Column::Scalar(_) => self.clone(), + } + } + + pub fn equals(&self, right: &Column) -> bool { + // @scalar-opt + self.as_materialized_series() + .equals(right.as_materialized_series()) + } + + pub fn equals_missing(&self, right: &Column) -> bool { + // @scalar-opt + self.as_materialized_series() + .equals_missing(right.as_materialized_series()) + } + + pub fn set_sorted_flag(&mut self, sorted: IsSorted) { + // @scalar-opt + match self { + Column::Series(s) => s.set_sorted_flag(sorted), + Column::Scalar(_) => {}, + } + } + + pub fn get_flags(&self) -> MetadataFlags { + match self { + Column::Series(s) => s.get_flags(), + // @scalar-opt + Column::Scalar(_) => MetadataFlags::empty(), + } + } + + pub fn get_data_ptr(&self) -> usize { + // @scalar-opt + self.as_materialized_series().get_data_ptr() + } + + pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { + // @scalar-opt? + self.as_materialized_series().vec_hash(build_hasher, buf) + } + + pub fn vec_hash_combine( + &self, + build_hasher: PlRandomState, + hashes: &mut [u64], + ) -> PolarsResult<()> { + // @scalar-opt? + self.as_materialized_series() + .vec_hash_combine(build_hasher, hashes) + } + + /// # Safety + /// + /// Indexes need to be in bounds. 
+ pub(crate) unsafe fn equal_element( + &self, + idx_self: usize, + idx_other: usize, + other: &Column, + ) -> bool { + // @scalar-opt + unsafe { + self.as_materialized_series().equal_element( + idx_self, + idx_other, + other.as_materialized_series(), + ) + } + } + + pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { + // @scalar-opt + self.into_materialized_series() + .append(other.as_materialized_series())?; + Ok(self) + } + + pub fn arg_sort(&self, options: SortOptions) -> IdxCa { + // @scalar-opt + self.as_materialized_series().arg_sort(options) + } + + pub fn bit_repr(&self) -> Option { + // @scalar-opt + self.as_materialized_series().bit_repr() + } + + pub fn into_frame(self) -> DataFrame { + // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names + unsafe { DataFrame::new_no_checks(vec![self]) } + } + + pub fn unique_stable(&self) -> PolarsResult { + // @scalar-opt? + self.as_materialized_series() + .unique_stable() + .map(Column::from) + } + + pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { + // @scalar-opt + self.into_materialized_series() + .extend(other.as_materialized_series())?; + Ok(self) + } + + pub fn rechunk(&self) -> Column { + match self { + Column::Series(s) => s.rechunk().into(), + Column::Scalar(_) => self.clone(), + } + } + + pub fn explode(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().explode().map(Column::from) + } + + pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .fill_null(strategy) + .map(Column::from) + } + + pub fn divide(&self, rhs: &Column) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .divide(rhs.as_materialized_series()) + .map(Column::from) + } + + pub fn shift(&self, periods: i64) -> Column { + // @scalar-opt + self.as_materialized_series().shift(periods).into() + } + + #[cfg(feature = "zip_with")] + pub fn zip_with_same_type( + &self, 
+ mask: &ChunkedArray, + other: &Column, + ) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .zip_with_same_type(mask, other.as_materialized_series()) + .map(Column::from) + } + + pub fn drop_nulls(&self) -> Column { + // @scalar-opt + self.as_materialized_series().drop_nulls().into() + } + + pub fn is_sorted_flag(&self) -> IsSorted { + // @scalar-opt + self.as_materialized_series().is_sorted_flag() + } + + pub fn unique(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().unique().map(Column::from) + } + + pub fn reshape_list(&self, dimensions: &[i64]) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .reshape_list(dimensions) + .map(Self::from) + } + + #[cfg(feature = "dtype-array")] + pub fn reshape_array(&self, dimensions: &[i64]) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .reshape_array(dimensions) + .map(Self::from) + } + + pub fn sort(&self, sort_options: SortOptions) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .sort(sort_options) + .map(Self::from) + } + + pub fn filter(&self, filter: &ChunkedArray) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().filter(filter).map(Self::from) + } + + #[cfg(feature = "random")] + pub fn shuffle(&self, seed: Option) -> Self { + // @scalar-opt + self.as_materialized_series().shuffle(seed).into() + } + + #[cfg(feature = "random")] + pub fn sample_frac( + &self, + frac: f64, + with_replacement: bool, + shuffle: bool, + seed: Option, + ) -> PolarsResult { + self.as_materialized_series() + .sample_frac(frac, with_replacement, shuffle, seed) + .map(Self::from) + } + + #[cfg(feature = "random")] + pub fn sample_n( + &self, + n: usize, + with_replacement: bool, + shuffle: bool, + seed: Option, + ) -> PolarsResult { + self.as_materialized_series() + .sample_n(n, with_replacement, shuffle, seed) + .map(Self::from) + } + + pub fn gather_every(&self, n: usize, offset: usize) -> Column { + // @scalar-opt + 
self.as_materialized_series().gather_every(n, offset).into() + } + + pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .extend_constant(value, n) + .map(Self::from) + } + + #[cfg(feature = "zip_with")] + pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .zip_with(mask, other.as_materialized_series()) + .map(Self::from) + } + + pub fn is_finite(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_finite() + } + + pub fn is_infinite(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_infinite() + } + + pub fn is_nan(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_nan() + } + + pub fn is_not_nan(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().is_not_nan() + } + + pub fn wrapping_trunc_div_scalar(&self, rhs: T) -> Self + where + T: Num + NumCast, + { + // @scalar-opt + self.as_materialized_series() + .wrapping_trunc_div_scalar(rhs) + .into() + } + + pub fn product(&self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series().product() + } + + pub fn phys_iter(&self) -> SeriesPhysIter<'_> { + // @scalar-opt + self.as_materialized_series().phys_iter() + } + + #[inline] + pub fn get(&self, index: usize) -> PolarsResult { + polars_ensure!(index < self.len(), oob = index, self.len()); + + // SAFETY: Bounds check done just before. 
+ Ok(unsafe { self.get_unchecked(index) }) + } + /// # Safety + /// + /// Does not perform bounds check on `index` + #[inline(always)] + pub unsafe fn get_unchecked(&self, index: usize) -> AnyValue { + debug_assert!(index < self.len()); + + match self { + Column::Series(s) => s.get_unchecked(index), + Column::Scalar(s) => s.value.as_any_value(), + } + } + + #[cfg(feature = "object")] + pub fn get_object( + &self, + index: usize, + ) -> Option<&dyn crate::chunked_array::object::PolarsObjectSafe> { + self.as_materialized_series().get_object(index) + } + + pub fn bitand(&self, rhs: &Self) -> PolarsResult { + self.as_materialized_series() + .bitand(rhs.as_materialized_series()) + .map(Column::from) + } + + pub(crate) fn str_value(&self, index: usize) -> PolarsResult> { + Ok(self.get(index)?.str_value()) + } +} + +impl ChunkCompare<&Column> for Column { + type Item = PolarsResult; + + /// Create a boolean mask by checking for equality. + #[inline] + fn equal(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .equal(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking for equality. + #[inline] + fn equal_missing(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .equal_missing(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking for inequality. + #[inline] + fn not_equal(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .not_equal(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking for inequality. + #[inline] + fn not_equal_missing(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .not_equal_missing(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self > rhs. + #[inline] + fn gt(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .gt(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self >= rhs. 
+ #[inline] + fn gt_eq(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .gt_eq(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self < rhs. + #[inline] + fn lt(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .lt(rhs.as_materialized_series()) + } + + /// Create a boolean mask by checking if self <= rhs. + #[inline] + fn lt_eq(&self, rhs: &Column) -> PolarsResult { + self.as_materialized_series() + .lt_eq(rhs.as_materialized_series()) + } +} + +impl Default for Column { + fn default() -> Self { + // @scalar-opt + Column::Series(Series::default()) + } +} + +impl PartialEq for Column { + fn eq(&self, other: &Self) -> bool { + // @scalar-opt + self.as_materialized_series() + .eq(other.as_materialized_series()) + } +} + +impl From for Column { + #[inline] + fn from(series: Series) -> Self { + if series.len() == 1 { + // SAFETY: We just did the bounds check + let value = unsafe { series.get_unchecked(0) }; + + if let Ok(value) = value.into_static() { + let value = Scalar::new(series.dtype().clone(), value); + let mut col = ScalarColumn::new(series.name().clone(), value, 1); + col.materialized = OnceLock::from(series); + return Self::Scalar(col); + } + } + + Self::Series(series) + } +} + +impl From for Column { + #[inline] + fn from(value: ScalarColumn) -> Self { + Self::Scalar(value) + } +} + +impl ScalarColumn { + #[inline] + pub fn new(name: PlSmallStr, value: Scalar, length: usize) -> Self { + Self { + name, + value, + length, + + materialized: OnceLock::new(), + } + } + + fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { + let series = if length == 0 { + Series::new_empty(name, value.dtype()) + } else { + value.into_series(name).new_from_index(0, length) + }; + + debug_assert_eq!(series.len(), length); + + series + } + + pub fn to_series(&self) -> Series { + Self::_to_series(self.name.clone(), self.value.clone(), self.length) + } + + /// Get the [`ScalarColumn`] as 
[`Series`] + /// + /// This needs to materialize upon the first call. Afterwards, this is cached. + pub fn as_materialized_series(&self) -> &Series { + self.materialized.get_or_init(|| self.to_series()) + } + + /// Take the [`ScalarColumn`] and materialize as a [`Series`] if not already done. + pub fn take_materialized_series(self) -> Series { + self.materialized + .into_inner() + .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) + } + + pub fn resize(&self, length: usize) -> ScalarColumn { + let mut sliced = Self { + name: self.name.clone(), + value: self.value.clone(), + length, + materialized: OnceLock::new(), + }; + + if self.length >= length { + if let Some(materialized) = self.materialized.get() { + sliced.materialized = OnceLock::from(materialized.head(Some(length))); + debug_assert_eq!(sliced.materialized.get().unwrap().len(), length); + } + } + + sliced + } + + pub fn has_nulls(&self) -> bool { + self.length != 0 && self.value.is_null() + } +} + +impl IntoColumn for T { + #[inline] + fn into_column(self) -> Column { + self.into_series().into() + } +} + +impl IntoColumn for Column { + #[inline(always)] + fn into_column(self) -> Column { + self + } +} + +impl IntoColumn for ScalarColumn { + #[inline(always)] + fn into_column(self) -> Column { + self.into() + } +} + +/// We don't want to serialize the scalar columns. So this helps pretend that columns are always +/// initialized without implementing From for Series. +/// +/// Those casts should be explicit. 
+#[derive(Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize))] +#[cfg_attr(feature = "serde", serde(into = "Series"))] +struct _SerdeSeries(Series); + +impl From for _SerdeSeries { + #[inline] + fn from(value: Column) -> Self { + Self(value.take_materialized_series()) + } +} + +impl From<_SerdeSeries> for Series { + #[inline] + fn from(value: _SerdeSeries) -> Self { + value.0 + } +} diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 5a84ca2f158b..274b1e310282 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -802,20 +802,7 @@ impl Series { // used for formatting pub fn str_value(&self, index: usize) -> PolarsResult> { - let out = match self.0.get(index)? { - AnyValue::String(s) => Cow::Borrowed(s), - AnyValue::Null => Cow::Borrowed("null"), - #[cfg(feature = "dtype-categorical")] - AnyValue::Categorical(idx, rev, arr) | AnyValue::Enum(idx, rev, arr) => { - if arr.is_null() { - Cow::Borrowed(rev.get(idx)) - } else { - unsafe { Cow::Borrowed(arr.deref_unchecked().value(idx as usize)) } - } - }, - av => Cow::Owned(format!("{av}")), - }; - Ok(out) + Ok(self.0.get(index)?.str_value()) } /// Get the head of the Series. 
pub fn head(&self, length: Option) -> Series { From fcfd94733ba961d40e4c8895157966a28590649c Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 12:54:08 +0200 Subject: [PATCH 32/42] scalar-opt Column arithmetic --- .../src/frame/column/arithmetic.rs | 342 ++++++++---------- crates/polars-core/src/frame/column/mod.rs | 29 ++ crates/polars-io/src/utils/other.rs | 6 +- crates/polars-lazy/src/tests/aggregations.rs | 2 +- crates/polars-lazy/src/tests/queries.rs | 4 +- crates/polars-ops/src/series/ops/duration.rs | 16 +- 6 files changed, 195 insertions(+), 204 deletions(-) diff --git a/crates/polars-core/src/frame/column/arithmetic.rs b/crates/polars-core/src/frame/column/arithmetic.rs index 641d74f5ae6b..79fd0053b320 100644 --- a/crates/polars-core/src/frame/column/arithmetic.rs +++ b/crates/polars-core/src/frame/column/arithmetic.rs @@ -1,192 +1,154 @@ -use std::ops::{Add, Div, Mul, Rem, Sub}; - use num_traits::{Num, NumCast}; -use polars_error::PolarsResult; - -use super::Column; - -impl Add for Column { - type Output = PolarsResult; - - fn add(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .add(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Add for &Column { - type Output = PolarsResult; - - fn add(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .add(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for Column { - type Output = PolarsResult; - - fn sub(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .sub(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for &Column { - type Output = PolarsResult; - - fn sub(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .sub(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Mul for Column { - type Output = PolarsResult; - - fn mul(self, rhs: Self) -> Self::Output { - // @scalar-opt - 
self.as_materialized_series() - .mul(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Mul for &Column { - type Output = PolarsResult; - - fn mul(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .mul(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn sub(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().sub(rhs).into() - } -} - -impl Sub for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn sub(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().sub(rhs).into() - } -} - -impl Add for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn add(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().add(rhs).into() - } -} - -impl Add for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn add(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().add(rhs).into() - } -} - -impl Div for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn div(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().div(rhs).into() - } -} - -impl Div for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn div(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().div(rhs).into() - } -} - -impl Mul for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn mul(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().mul(rhs).into() - } -} - -impl Mul for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn mul(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().mul(rhs).into() - } -} - -impl Rem for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn rem(self, rhs: T) -> Self::Output { - // @scalar-opt - 
self.as_materialized_series().rem(rhs).into() - } -} - -impl Rem for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn rem(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().rem(rhs).into() - } +use polars_error::{polars_bail, PolarsResult}; + +use super::{Column, ScalarColumn, Series}; +use crate::utils::Container; + +fn output_length(a: &Column, b: &Column) -> PolarsResult { + match (a.len(), b.len()) { + // broadcasting + (1, o) | (o, 1) => Ok(o), + // equal + (a, b) if a == b => Ok(a), + // unequal + (a, b) => { + polars_bail!(InvalidOperation: "cannot do arithmetic operation on series of different lengths: got {} and {}", a, b) + }, + } +} + +fn unit_series_op PolarsResult>( + l: &Series, + r: &Series, + op: F, + length: usize, +) -> PolarsResult { + debug_assert!(l.len() <= 1); + debug_assert!(r.len() <= 1); + + op(l, r) + .and_then(|s| ScalarColumn::from_single_value_series(s, length)) + .map(Column::from) +} + +fn op_with_broadcast PolarsResult>( + l: &Column, + r: &Column, + op: F, +) -> PolarsResult { + // Here we rely on the underlying broadcast operations. 
+ + let length = output_length(l, r)?; + match (l, r) { + (Column::Series(l), Column::Series(r)) => op(l, r).map(Column::from), + (Column::Series(l), Column::Scalar(r)) => { + let r = r.as_single_value_series(); + if l.len() == 1 { + unit_series_op(l, &r, op, length) + } else { + op(l, &r).map(Column::from) + } + }, + (Column::Scalar(l), Column::Series(r)) => { + let l = l.as_single_value_series(); + if r.len() == 1 { + unit_series_op(&l, r, op, length) + } else { + op(&l, r).map(Column::from) + } + }, + (Column::Scalar(l), Column::Scalar(r)) => unit_series_op( + &l.as_single_value_series(), + &r.as_single_value_series(), + op, + length, + ), + } +} + +fn num_op_with_broadcast Series>( + c: &'_ Column, + n: T, + op: F, +) -> PolarsResult { + match c { + Column::Series(s) => Ok(op(s, n).into()), + Column::Scalar(s) => { + ScalarColumn::from_single_value_series(op(&s.as_single_value_series(), n), s.length) + .map(Column::from) + }, + } +} + +macro_rules! broadcastable_ops { + ($(($trait:ident, $op:ident))+) => { + $( + impl std::ops::$trait for Column { + type Output = PolarsResult; + + #[inline] + fn $op(self, rhs: Self) -> Self::Output { + op_with_broadcast(&self, &rhs, |l, r| l.$op(r)) + } + } + + impl std::ops::$trait for &Column { + type Output = PolarsResult; + + #[inline] + fn $op(self, rhs: Self) -> Self::Output { + op_with_broadcast(self, rhs, |l, r| l.$op(r)) + } + } + )+ + } +} + +macro_rules! broadcastable_num_ops { + ($(($trait:ident, $op:ident))+) => { + $( + impl std::ops::$trait:: for Column + where + T: Num + NumCast, + { + type Output = PolarsResult; + + #[inline] + fn $op(self, rhs: T) -> Self::Output { + num_op_with_broadcast(&self, rhs, |l, r| l.$op(r)) + } + } + + impl std::ops::$trait:: for &Column + where + T: Num + NumCast, + { + type Output = PolarsResult; + + #[inline] + fn $op(self, rhs: T) -> Self::Output { + num_op_with_broadcast(self, rhs, |l, r| l.$op(r)) + } + } + )+ + }; +} + +broadcastable_ops! 
{ + (Add, add) + (Sub, sub) + (Mul, mul) + (Div, div) + (Rem, rem) + (BitAnd, bitand) + (BitOr, bitor) + (BitXor, bitxor) +} + +broadcastable_num_ops! { + (Add, add) + (Sub, sub) + (Mul, mul) + (Div, div) + (Rem, rem) } diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index aac04c7879a6..05de2185956b 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -1010,6 +1010,7 @@ impl ScalarColumn { series } + /// Materialize the [`ScalarColumn`] into a [`Series`]. pub fn to_series(&self) -> Series { Self::_to_series(self.name.clone(), self.value.clone(), self.length) } @@ -1028,6 +1029,34 @@ impl ScalarColumn { .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) } + /// Take the [`ScalarColumn`] as a series with a single value. + /// + /// If the [`ScalarColumn`] has `length=0` the resulting `Series` will also have `length=0`. + pub fn as_single_value_series(&self) -> Series { + match self.materialized.get() { + Some(s) => s.head(Some(1)), + None => Self::_to_series( + self.name.clone(), + self.value.clone(), + usize::min(1, self.length), + ), + } + } + + /// Create a new [`ScalarColumn`] from a `length=1` Series and expand it to `length`. + /// + /// This returns an error if the first value cannot be retrieved or cannot be made static; debug builds also assert that the series has length `1`. + pub fn from_single_value_series(series: Series, length: usize) -> PolarsResult { + debug_assert_eq!(series.len(), 1); + let value = series.get(0)?; + let value = value.into_static()?; + let value = Scalar::new(series.dtype().clone(), value); + Ok(ScalarColumn::new(series.name().clone(), value, length)) + } + + /// Resize the [`ScalarColumn`] to new `length`. + /// + /// This reuses the materialized [`Series`], if `length <= self.length`.
pub fn resize(&self, length: usize) -> ScalarColumn { let mut sliced = Self { name: self.name.clone(), diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 12e3ee2f9d01..45300d80d319 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -87,7 +87,7 @@ pub(crate) fn update_row_counts(dfs: &mut [(DataFrame, IdxSize)], offset: IdxSiz let mut previous = dfs[0].1 + offset; for (df, n_read) in &mut dfs[1..] { if let Some(s) = unsafe { df.get_columns_mut() }.get_mut(0) { - *s = &*s + previous; + *s = (&*s + previous).unwrap(); } previous += *n_read; } @@ -103,7 +103,7 @@ pub(crate) fn update_row_counts2(dfs: &mut [DataFrame], offset: IdxSize) { for df in &mut dfs[1..] { let n_read = df.height() as IdxSize; if let Some(s) = unsafe { df.get_columns_mut() }.get_mut(0) { - *s = &*s + previous; + *s = (&*s + previous).unwrap(); } previous += n_read; } @@ -122,7 +122,7 @@ pub(crate) fn update_row_counts3(dfs: &mut [DataFrame], heights: &[IdxSize], off let n_read = heights[i]; if let Some(s) = unsafe { df.get_columns_mut() }.get_mut(0) { - *s = &*s + previous; + *s = (&*s + previous).unwrap(); } previous += n_read; diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index 6b2d8cb05da0..7bb21eb5bcd6 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -31,7 +31,7 @@ fn test_agg_exprs() -> PolarsResult<()> { .lazy() .group_by_stable([col("cars")]) .agg([(lit(1) - col("A")) - .map(|s| Ok(Some(&s * 2)), GetOutput::same_type()) + .map(|s| Ok(Some((&s * 2)?)), GetOutput::same_type()) .alias("foo")]) .collect()?; let ca = out.column("foo")?.list()?; diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index 4d482202cd67..ff4894b99857 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -88,7 +88,7 @@ fn 
test_lazy_udf() { let df = get_df(); let new = df .lazy() - .select([col("sepal_width").map(|s| Ok(Some(s * 200.0)), GetOutput::same_type())]) + .select([col("sepal_width").map(|s| Ok(Some((s * 200.0)?)), GetOutput::same_type())]) .collect() .unwrap(); assert_eq!( @@ -247,7 +247,7 @@ fn test_lazy_query_2() { let df = load_df(); let ldf = df .lazy() - .with_column(col("a").map(|s| Ok(Some(s * 2)), GetOutput::same_type())) + .with_column(col("a").map(|s| Ok(Some((s * 2)?)), GetOutput::same_type())) .filter(col("a").lt(lit(2))) .select([col("b"), col("a")]); diff --git a/crates/polars-ops/src/series/ops/duration.rs b/crates/polars-ops/src/series/ops/duration.rs index bed8c8d90119..bdfb17114459 100644 --- a/crates/polars-ops/src/series/ops/duration.rs +++ b/crates/polars-ops/src/series/ops/duration.rs @@ -35,7 +35,7 @@ pub fn impl_duration(s: &[Column], time_unit: TimeUnit) -> PolarsResult microseconds = (microseconds + (nanoseconds.wrapping_trunc_div_scalar(1_000)))?; } if !is_zero_scalar(&milliseconds) { - microseconds = (microseconds + (milliseconds * 1_000))?; + microseconds = (microseconds + (milliseconds * 1_000)?)?; } microseconds }, @@ -44,10 +44,10 @@ pub fn impl_duration(s: &[Column], time_unit: TimeUnit) -> PolarsResult nanoseconds = nanoseconds.new_from_index(0, max_len); } if !is_zero_scalar(µseconds) { - nanoseconds = (nanoseconds + (microseconds * 1_000))?; + nanoseconds = (nanoseconds + (microseconds * 1_000)?)?; } if !is_zero_scalar(&milliseconds) { - nanoseconds = (nanoseconds + (milliseconds * 1_000_000))?; + nanoseconds = (nanoseconds + (milliseconds * 1_000_000)?)?; } nanoseconds }, @@ -72,19 +72,19 @@ pub fn impl_duration(s: &[Column], time_unit: TimeUnit) -> PolarsResult TimeUnit::Milliseconds => MILLISECONDS, }; if !is_zero_scalar(&seconds) { - duration = (duration + seconds * multiplier)?; + duration = ((duration + seconds)? 
* multiplier)?; } if !is_zero_scalar(&minutes) { - duration = (duration + minutes * (multiplier * 60))?; + duration = ((duration + minutes)? * (multiplier * 60))?; } if !is_zero_scalar(&hours) { - duration = (duration + hours * (multiplier * 60 * 60))?; + duration = ((duration + hours)? * (multiplier * 60 * 60))?; } if !is_zero_scalar(&days) { - duration = (duration + days * (multiplier * SECONDS_IN_DAY))?; + duration = ((duration + days)? * (multiplier * SECONDS_IN_DAY))?; } if !is_zero_scalar(&weeks) { - duration = (duration + weeks * (multiplier * SECONDS_IN_DAY * 7))?; + duration = ((duration + weeks)? * (multiplier * SECONDS_IN_DAY * 7))?; } duration From 3b591a832dbeb88369ebd23ac0dafba5d0f50360 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 14:49:31 +0200 Subject: [PATCH 33/42] scalar-opt Column casting --- .../src/chunked_array/logical/datetime.rs | 4 +- .../src/chunked_array/ops/any_value.rs | 2 +- crates/polars-core/src/datatypes/any_value.rs | 139 ++++++++++-- crates/polars-core/src/fmt.rs | 38 ++-- crates/polars-core/src/frame/column/mod.rs | 214 +++++++++++++----- crates/polars-core/src/frame/mod.rs | 2 + crates/polars-plan/src/plans/lit.rs | 4 +- .../polars-plan/src/plans/python/pyarrow.rs | 4 +- .../polars-python/src/conversion/any_value.rs | 51 +++-- 9 files changed, 334 insertions(+), 124 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/datetime.rs b/crates/polars-core/src/chunked_array/logical/datetime.rs index 1be7a1cf9747..fd99ac74ce0f 100644 --- a/crates/polars-core/src/chunked_array/logical/datetime.rs +++ b/crates/polars-core/src/chunked_array/logical/datetime.rs @@ -19,13 +19,13 @@ impl LogicalType for DatetimeChunked { fn get_any_value(&self, i: usize) -> PolarsResult> { self.0 .get_any_value(i) - .map(|av| av.as_datetime(self.time_unit(), self.time_zone())) + .map(|av| av.as_datetime(self.time_unit(), self.time_zone().as_ref())) } unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> { 
self.0 .get_any_value_unchecked(i) - .as_datetime(self.time_unit(), self.time_zone()) + .as_datetime(self.time_unit(), self.time_zone().as_ref()) } fn cast_with_options( diff --git a/crates/polars-core/src/chunked_array/ops/any_value.rs b/crates/polars-core/src/chunked_array/ops/any_value.rs index 2a50b24d9bbf..ee9843ab58ad 100644 --- a/crates/polars-core/src/chunked_array/ops/any_value.rs +++ b/crates/polars-core/src/chunked_array/ops/any_value.rs @@ -100,7 +100,7 @@ pub(crate) unsafe fn arr_to_any_value<'a>( DataType::Datetime(tu, tz) => { let arr = &*(arr as *const dyn Array as *const Int64Array); let v = arr.value_unchecked(idx); - AnyValue::Datetime(v, *tu, tz) + AnyValue::Datetime(v, *tu, tz.as_ref()) }, #[cfg(feature = "dtype-date")] DataType::Date => { diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index f7461d370b87..9b92ac0990eb 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -62,8 +62,12 @@ pub enum AnyValue<'a> { /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in nanoseconds (64 bits). #[cfg(feature = "dtype-datetime")] - Datetime(i64, TimeUnit, &'a Option), - // A 64-bit integer representing difference between date-times in [`TimeUnit`] + Datetime(i64, TimeUnit, Option<&'a TimeZone>), + /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// in nanoseconds (64 bits). 
+ #[cfg(feature = "dtype-datetime")] + DatetimeOwned(i64, TimeUnit, Option>), + /// A 64-bit integer representing difference between date-times in [`TimeUnit`] #[cfg(feature = "dtype-duration")] Duration(i64, TimeUnit), /// A 64-bit time representing the elapsed time since midnight in nanoseconds @@ -73,8 +77,14 @@ pub enum AnyValue<'a> { // otherwise it is in the array pointer #[cfg(feature = "dtype-categorical")] Categorical(u32, &'a RevMapping, SyncPtr), + // If syncptr is_null the data is in the rev-map + // otherwise it is in the array pointer + #[cfg(feature = "dtype-categorical")] + CategoricalOwned(u32, Arc, SyncPtr), #[cfg(feature = "dtype-categorical")] Enum(u32, &'a RevMapping, SyncPtr), + #[cfg(feature = "dtype-categorical")] + EnumOwned(u32, Arc, SyncPtr), /// Nested type, contains arrays that are filled with one of the datatypes. List(Series), #[cfg(feature = "dtype-array")] @@ -392,13 +402,19 @@ impl<'a> AnyValue<'a> { #[cfg(feature = "dtype-time")] Time(_) => DataType::Time, #[cfg(feature = "dtype-datetime")] - Datetime(_, tu, tz) => DataType::Datetime(*tu, (*tz).clone()), + Datetime(_, tu, tz) => DataType::Datetime(*tu, (*tz).cloned()), + #[cfg(feature = "dtype-datetime")] + DatetimeOwned(_, tu, tz) => { + DataType::Datetime(*tu, tz.as_ref().map(|v| v.as_ref().clone())) + }, #[cfg(feature = "dtype-duration")] Duration(_, tu) => DataType::Duration(*tu), #[cfg(feature = "dtype-categorical")] - Categorical(_, _, _) => DataType::Categorical(None, Default::default()), + Categorical(_, _, _) | CategoricalOwned(_, _, _) => { + DataType::Categorical(None, Default::default()) + }, #[cfg(feature = "dtype-categorical")] - Enum(_, _, _) => DataType::Enum(None, Default::default()), + Enum(_, _, _) | EnumOwned(_, _, _) => DataType::Enum(None, Default::default()), List(s) => DataType::List(Box::new(s.dtype().clone())), #[cfg(feature = "dtype-array")] Array(s, size) => DataType::Array(Box::new(s.dtype().clone()), *size), @@ -434,7 +450,7 @@ impl<'a> AnyValue<'a> { 
#[cfg(feature = "dtype-date")] Date(v) => NumCast::from(*v), #[cfg(feature = "dtype-datetime")] - Datetime(v, _, _) => NumCast::from(*v), + Datetime(v, _, _) | DatetimeOwned(v, _, _) => NumCast::from(*v), #[cfg(feature = "dtype-time")] Time(v) => NumCast::from(*v), #[cfg(feature = "dtype-duration")] @@ -568,7 +584,7 @@ impl<'a> AnyValue<'a> { // to datetime #[cfg(feature = "dtype-datetime")] (av, DataType::Datetime(tu, tz)) if av.is_numeric() => { - AnyValue::Datetime(av.extract::()?, *tu, tz) + AnyValue::Datetime(av.extract::()?, *tu, tz.as_ref()) }, #[cfg(all(feature = "dtype-datetime", feature = "dtype-date"))] (AnyValue::Date(v), DataType::Datetime(tu, _)) => AnyValue::Datetime( @@ -578,10 +594,13 @@ impl<'a> AnyValue<'a> { TimeUnit::Milliseconds => (*v as i64) * MS_IN_DAY, }, *tu, - &None, + None, ), #[cfg(feature = "dtype-datetime")] - (AnyValue::Datetime(v, tu, _), DataType::Datetime(tu_r, tz_r)) => AnyValue::Datetime( + ( + AnyValue::Datetime(v, tu, _) | AnyValue::DatetimeOwned(v, tu, _), + DataType::Datetime(tu_r, tz_r), + ) => AnyValue::Datetime( match (tu, tu_r) { (TimeUnit::Nanoseconds, TimeUnit::Microseconds) => *v / 1_000i64, (TimeUnit::Nanoseconds, TimeUnit::Milliseconds) => *v / 1_000_000i64, @@ -592,28 +611,32 @@ impl<'a> AnyValue<'a> { _ => *v, }, *tu_r, - tz_r, + tz_r.as_ref(), ), // to date #[cfg(feature = "dtype-date")] (av, DataType::Date) if av.is_numeric() => AnyValue::Date(av.extract::()?), #[cfg(all(feature = "dtype-date", feature = "dtype-datetime"))] - (AnyValue::Datetime(v, tu, _), DataType::Date) => AnyValue::Date(match tu { - TimeUnit::Nanoseconds => *v / NS_IN_DAY, - TimeUnit::Microseconds => *v / US_IN_DAY, - TimeUnit::Milliseconds => *v / MS_IN_DAY, - } as i32), + (AnyValue::Datetime(v, tu, _) | AnyValue::DatetimeOwned(v, tu, _), DataType::Date) => { + AnyValue::Date(match tu { + TimeUnit::Nanoseconds => *v / NS_IN_DAY, + TimeUnit::Microseconds => *v / US_IN_DAY, + TimeUnit::Milliseconds => *v / MS_IN_DAY, + } as i32) + }, // to 
time #[cfg(feature = "dtype-time")] (av, DataType::Time) if av.is_numeric() => AnyValue::Time(av.extract::()?), #[cfg(all(feature = "dtype-time", feature = "dtype-datetime"))] - (AnyValue::Datetime(v, tu, _), DataType::Time) => AnyValue::Time(match tu { - TimeUnit::Nanoseconds => *v % NS_IN_DAY, - TimeUnit::Microseconds => (*v % US_IN_DAY) * 1_000i64, - TimeUnit::Milliseconds => (*v % MS_IN_DAY) * 1_000_000i64, - }), + (AnyValue::Datetime(v, tu, _) | AnyValue::DatetimeOwned(v, tu, _), DataType::Time) => { + AnyValue::Time(match tu { + TimeUnit::Nanoseconds => *v % NS_IN_DAY, + TimeUnit::Microseconds => (*v % US_IN_DAY) * 1_000i64, + TimeUnit::Milliseconds => (*v % MS_IN_DAY) * 1_000_000i64, + }) + }, // to duration #[cfg(feature = "dtype-duration")] @@ -719,6 +742,14 @@ impl<'a> AnyValue<'a> { unsafe { Cow::Borrowed(arr.deref_unchecked().value(*idx as usize)) } } }, + #[cfg(feature = "dtype-categorical")] + Self::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => { + if arr.is_null() { + Cow::Owned(rev.get(*idx).to_string()) + } else { + unsafe { Cow::Borrowed(arr.deref_unchecked().value(*idx as usize)) } + } + }, av => Cow::Owned(av.to_string()), } } @@ -776,6 +807,12 @@ impl AnyValue<'_> { tu.hash(state); tz.hash(state); }, + #[cfg(feature = "dtype-datetime")] + DatetimeOwned(v, tu, tz) => { + v.hash(state); + tu.hash(state); + tz.hash(state); + }, #[cfg(feature = "dtype-duration")] Duration(v, tz) => { v.hash(state); @@ -784,7 +821,10 @@ impl AnyValue<'_> { #[cfg(feature = "dtype-time")] Time(v) => v.hash(state), #[cfg(feature = "dtype-categorical")] - Categorical(v, _, _) | Enum(v, _, _) => v.hash(state), + Categorical(v, _, _) + | CategoricalOwned(v, _, _) + | Enum(v, _, _) + | EnumOwned(v, _, _) => v.hash(state), #[cfg(feature = "object")] Object(_) => {}, #[cfg(feature = "object")] @@ -841,7 +881,7 @@ impl<'a> AnyValue<'a> { } } #[cfg(feature = "dtype-datetime")] - pub(crate) fn as_datetime(&self, tu: TimeUnit, tz: &'a Option) -> 
AnyValue<'a> { + pub(crate) fn as_datetime(&self, tu: TimeUnit, tz: Option<&'a TimeZone>) -> AnyValue<'a> { match self { AnyValue::Int64(v) => AnyValue::Datetime(*v, tu, tz), AnyValue::Null => AnyValue::Null, @@ -904,6 +944,12 @@ impl<'a> AnyValue<'a> { match self { AnyValue::BinaryOwned(data) => AnyValue::Binary(data), AnyValue::StringOwned(data) => AnyValue::String(data.as_str()), + #[cfg(feature = "dtype-datetime")] + AnyValue::DatetimeOwned(v, tu, tz) => AnyValue::Datetime(*v, *tu, tz.as_ref().map(AsRef::as_ref)), + #[cfg(feature = "dtype-categorical")] + AnyValue::CategoricalOwned(v, rev, arr) => AnyValue::Categorical(*v, rev.as_ref(), *arr), + #[cfg(feature = "dtype-categorical")] + AnyValue::EnumOwned(v, rev, arr) => AnyValue::Enum(*v, rev.as_ref(), *arr), av => av.clone(), } } @@ -926,8 +972,14 @@ impl<'a> AnyValue<'a> { Boolean(v) => Boolean(v), Float32(v) => Float32(v), Float64(v) => Float64(v), + #[cfg(feature = "dtype-datetime")] + Datetime(v, tu, tz) => DatetimeOwned(v, tu, tz.map(|v| Arc::new(v.clone()))), + #[cfg(feature = "dtype-datetime")] + DatetimeOwned(v, tu, tz) => DatetimeOwned(v, tu, tz), #[cfg(feature = "dtype-date")] Date(v) => Date(v), + #[cfg(feature = "dtype-duration")] + Duration(v, tu) => Duration(v, tu), #[cfg(feature = "dtype-time")] Time(v) => Time(v), List(v) => List(v), @@ -958,8 +1010,14 @@ impl<'a> AnyValue<'a> { }, #[cfg(feature = "dtype-decimal")] Decimal(val, scale) => Decimal(val, scale), - #[allow(unreachable_patterns)] - dt => polars_bail!(ComputeError: "cannot get static any-value from {}", dt), + #[cfg(feature = "dtype-categorical")] + Categorical(v, rev, arr) => CategoricalOwned(v, Arc::new(rev.clone()), arr), + #[cfg(feature = "dtype-categorical")] + CategoricalOwned(v, rev, arr) => CategoricalOwned(v, rev, arr), + #[cfg(feature = "dtype-categorical")] + Enum(v, rev, arr) => EnumOwned(v, Arc::new(rev.clone()), arr), + #[cfg(feature = "dtype-categorical")] + EnumOwned(v, rev, arr) => EnumOwned(v, rev, arr), }; Ok(av) } 
@@ -978,6 +1036,15 @@ impl<'a> AnyValue<'a> { }; Some(s) }, + #[cfg(feature = "dtype-categorical")] + AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => { + let s = if arr.is_null() { + rev.get(*idx) + } else { + unsafe { arr.deref_unchecked().value(*idx as usize) } + }; + Some(s) + }, _ => None, } } @@ -1010,6 +1077,18 @@ impl AnyValue<'_> { (l, BinaryOwned(r)) => *l == AnyValue::Binary(r.as_slice()), #[cfg(feature = "object")] (l, ObjectOwned(r)) => *l == AnyValue::Object(&*r.0), + #[cfg(feature = "dtype-datetime")] + (DatetimeOwned(lv, ltu, ltz), r) => Datetime(*lv, *ltu, ltz.as_ref().map(|v| v.as_ref())) == *r, + #[cfg(feature = "dtype-datetime")] + (l, DatetimeOwned(rv, rtu, rtz)) => *l == Datetime(*rv, *rtu, rtz.as_ref().map(|v| v.as_ref())), + #[cfg(feature = "dtype-categorical")] + (CategoricalOwned(lv, lrev, larr), r) => Categorical(*lv, lrev.as_ref(), *larr) == *r, + #[cfg(feature = "dtype-categorical")] + (l, CategoricalOwned(rv, rrev, rarr)) => *l == Categorical(*rv, rrev.as_ref(), *rarr), + #[cfg(feature = "dtype-categorical")] + (EnumOwned(lv, lrev, larr), r) => Enum(*lv, lrev.as_ref(), *larr) == *r, + #[cfg(feature = "dtype-categorical")] + (l, EnumOwned(rv, rrev, rarr)) => *l == Enum(*rv, rrev.as_ref(), *rarr), // Comparison with null. 
(Null, Null) => null_equal, @@ -1140,6 +1219,18 @@ impl PartialOrd for AnyValue<'_> { (l, BinaryOwned(r)) => l.partial_cmp(&AnyValue::Binary(r.as_slice())), #[cfg(feature = "object")] (l, ObjectOwned(r)) => l.partial_cmp(&AnyValue::Object(&*r.0)), + #[cfg(feature = "dtype-datetime")] + (DatetimeOwned(lv, ltu, ltz), r) => Datetime(*lv, *ltu, ltz.as_ref().map(|v| v.as_ref())).partial_cmp(r), + #[cfg(feature = "dtype-datetime")] + (l, DatetimeOwned(rv, rtu, rtz)) => l.partial_cmp(&Datetime(*rv, *rtu, rtz.as_ref().map(|v| v.as_ref()))), + #[cfg(feature = "dtype-categorical")] + (CategoricalOwned(lv, lrev, larr), r) => Categorical(*lv, lrev.as_ref(), *larr).partial_cmp(r), + #[cfg(feature = "dtype-categorical")] + (l, CategoricalOwned(rv, rrev, rarr)) => l.partial_cmp(&Categorical(*rv, rrev.as_ref(), *rarr)), + #[cfg(feature = "dtype-categorical")] + (EnumOwned(lv, lrev, larr), r) => Enum(*lv, lrev.as_ref(), *larr).partial_cmp(r), + #[cfg(feature = "dtype-categorical")] + (l, EnumOwned(rv, rrev, rarr)) => l.partial_cmp(&Enum(*rv, rrev.as_ref(), *rarr)), // Comparison with null. 
(Null, Null) => Some(Ordering::Equal), diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index c8ab72d02031..4704ce1813cc 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -907,6 +907,23 @@ fn fmt_float(f: &mut Formatter<'_>, width: usize, v: T) -> fmt } } +fn fmt_datetime( + f: &mut Formatter<'_>, + v: i64, + tu: TimeUnit, + tz: Option<&self::datatypes::TimeZone>, +) -> fmt::Result { + let ndt = match tu { + TimeUnit::Nanoseconds => timestamp_ns_to_datetime(v), + TimeUnit::Microseconds => timestamp_us_to_datetime(v), + TimeUnit::Milliseconds => timestamp_ms_to_datetime(v), + }; + match tz { + None => std::fmt::Display::fmt(&ndt, f), + Some(tz) => PlTzAware::new(ndt, tz).fmt(f), + } +} + #[cfg(feature = "dtype-duration")] const NAMES: [&str; 4] = ["d", "h", "m", "s"]; #[cfg(feature = "dtype-duration")] @@ -1023,19 +1040,9 @@ impl Display for AnyValue<'_> { #[cfg(feature = "dtype-date")] AnyValue::Date(v) => write!(f, "{}", date32_to_date(*v)), #[cfg(feature = "dtype-datetime")] - AnyValue::Datetime(v, tu, tz) => { - let ndt = match tu { - TimeUnit::Nanoseconds => timestamp_ns_to_datetime(*v), - TimeUnit::Microseconds => timestamp_us_to_datetime(*v), - TimeUnit::Milliseconds => timestamp_ms_to_datetime(*v), - }; - match tz { - None => write!(f, "{ndt}"), - Some(tz) => { - write!(f, "{}", PlTzAware::new(ndt, tz)) - }, - } - }, + AnyValue::Datetime(v, tu, tz) => fmt_datetime(f, *v, *tu, *tz), + #[cfg(feature = "dtype-datetime")] + AnyValue::DatetimeOwned(v, tu, tz) => fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref())), #[cfg(feature = "dtype-duration")] AnyValue::Duration(v, tu) => match tu { TimeUnit::Nanoseconds => fmt_duration_ns(f, *v), @@ -1048,7 +1055,10 @@ impl Display for AnyValue<'_> { write!(f, "{nt}") }, #[cfg(feature = "dtype-categorical")] - AnyValue::Categorical(_, _, _) | AnyValue::Enum(_, _, _) => { + AnyValue::Categorical(_, _, _) + | AnyValue::CategoricalOwned(_, _, _) + | 
AnyValue::Enum(_, _, _) + | AnyValue::EnumOwned(_, _, _) => { let s = self.get_str().unwrap(); write!(f, "\"{s}\"") }, diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index 05de2185956b..e2badd50b825 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -7,6 +7,7 @@ use polars_utils::index::check_bounds; use polars_utils::pl_str::PlSmallStr; use self::gather::check_bounds_ca; +use crate::chunked_array::cast::CastOptions; use crate::chunked_array::metadata::MetadataFlags; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; @@ -39,7 +40,8 @@ pub enum Column { #[derive(Debug, Clone)] pub struct ScalarColumn { name: PlSmallStr, - value: Scalar, + // The value of this scalar may be incoherent when `length == 0`. + scalar: Scalar, length: usize, // invariants: @@ -72,8 +74,8 @@ impl Column { } #[inline] - pub fn new_scalar(name: PlSmallStr, value: Scalar, length: usize) -> Self { - Self::Scalar(ScalarColumn::new(name, value, length)) + pub fn new_scalar(name: PlSmallStr, scalar: Scalar, length: usize) -> Self { + Self::Scalar(ScalarColumn::new(name, scalar, length)) } // # Materialize @@ -119,7 +121,7 @@ impl Column { pub fn dtype(&self) -> &DataType { match self { Column::Series(s) => s.dtype(), - Column::Scalar(s) => s.value.dtype(), + Column::Scalar(s) => s.scalar.dtype(), } } @@ -128,7 +130,7 @@ impl Column { match self { Column::Series(s) => s.field(), Column::Scalar(s) => match s.materialized.get() { - None => Cow::Owned(Field::new(s.name.clone(), s.value.dtype().clone())), + None => Cow::Owned(Field::new(s.name.clone(), s.scalar.dtype().clone())), Some(s) => s.field(), }, } @@ -188,87 +190,67 @@ impl Column { // # To Chunked Arrays pub fn bool(&self) -> PolarsResult<&BooleanChunked> { - // @scalar-opt self.as_materialized_series().bool() } pub fn i8(&self) -> PolarsResult<&Int8Chunked> { - // @scalar-opt 
self.as_materialized_series().i8() } pub fn i16(&self) -> PolarsResult<&Int16Chunked> { - // @scalar-opt self.as_materialized_series().i16() } pub fn i32(&self) -> PolarsResult<&Int32Chunked> { - // @scalar-opt self.as_materialized_series().i32() } pub fn i64(&self) -> PolarsResult<&Int64Chunked> { - // @scalar-opt self.as_materialized_series().i64() } pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { - // @scalar-opt self.as_materialized_series().u8() } pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { - // @scalar-opt self.as_materialized_series().u16() } pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { - // @scalar-opt self.as_materialized_series().u32() } pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { - // @scalar-opt self.as_materialized_series().u64() } pub fn f32(&self) -> PolarsResult<&Float32Chunked> { - // @scalar-opt self.as_materialized_series().f32() } pub fn f64(&self) -> PolarsResult<&Float64Chunked> { - // @scalar-opt self.as_materialized_series().f64() } pub fn str(&self) -> PolarsResult<&StringChunked> { - // @scalar-opt self.as_materialized_series().str() } pub fn list(&self) -> PolarsResult<&ListChunked> { - // @scalar-opt self.as_materialized_series().list() } pub fn binary(&self) -> PolarsResult<&BinaryChunked> { - // @scalar-opt self.as_materialized_series().binary() } pub fn idx(&self) -> PolarsResult<&IdxCa> { - // @scalar-opt self.as_materialized_series().idx() } pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { - // @scalar-opt self.as_materialized_series().binary_offset() } #[cfg(feature = "dtype-datetime")] pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { - // @scalar-opt self.as_materialized_series().datetime() } #[cfg(feature = "dtype-struct")] pub fn struct_(&self) -> PolarsResult<&StructChunked> { - // @scalar-opt self.as_materialized_series().struct_() } #[cfg(feature = "dtype-decimal")] pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { - // @scalar-opt 
self.as_materialized_series().decimal() } #[cfg(feature = "dtype-array")] pub fn array(&self) -> PolarsResult<&ArrayChunked> { - // @scalar-opt self.as_materialized_series().array() } #[cfg(feature = "dtype-categorical")] @@ -277,38 +259,46 @@ impl Column { } #[cfg(feature = "dtype-date")] pub fn date(&self) -> PolarsResult<&DateChunked> { - // @scalar-opt self.as_materialized_series().date() } #[cfg(feature = "dtype-duration")] pub fn duration(&self) -> PolarsResult<&DurationChunked> { - // @scalar-opt self.as_materialized_series().duration() } // # Casting + pub fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult { + match self { + Column::Series(s) => s.cast_with_options(dtype, options).map(Column::from), + Column::Scalar(s) => s.cast_with_options(dtype, options).map(Column::from), + } + } pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .strict_cast(dtype) - .map(Column::from) + match self { + Column::Series(s) => s.strict_cast(dtype).map(Column::from), + Column::Scalar(s) => s.strict_cast(dtype).map(Column::from), + } } pub fn cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().cast(dtype).map(Column::from) + match self { + Column::Series(s) => s.cast(dtype).map(Column::from), + Column::Scalar(s) => s.cast(dtype).map(Column::from), + } } /// # Safety /// /// This can lead to invalid memory access in downstream code. 
pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) + match self { + Column::Series(s) => unsafe { s.cast_unchecked(dtype) }.map(Column::from), + Column::Scalar(s) => unsafe { s.cast_unchecked(dtype) }.map(Column::from), + } } pub fn clear(&self) -> Self { match self { Column::Series(s) => s.clear().into(), - Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), + Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.scalar.clone(), 0), } } @@ -326,7 +316,7 @@ impl Column { Column::Series(s) => s.new_from_index(index, length).into(), Column::Scalar(s) => { if index >= s.length { - Self::full_null(s.name.clone(), length, s.value.dtype()) + Self::full_null(s.name.clone(), length, s.scalar.dtype()) } else { s.resize(length).into() } @@ -346,14 +336,14 @@ impl Column { pub fn is_null(&self) -> BooleanChunked { match self { Self::Series(s) => s.is_null(), - Self::Scalar(s) => BooleanChunked::full(s.name.clone(), s.value.is_null(), s.length), + Self::Scalar(s) => BooleanChunked::full(s.name.clone(), s.scalar.is_null(), s.length), } } #[inline] pub fn is_not_null(&self) -> BooleanChunked { match self { Self::Series(s) => s.is_not_null(), - Self::Scalar(s) => BooleanChunked::full(s.name.clone(), !s.value.is_null(), s.length), + Self::Scalar(s) => BooleanChunked::full(s.name.clone(), !s.scalar.is_null(), s.length), } } @@ -376,9 +366,6 @@ impl Column { debug_assert!(len <= i64::MAX as usize); self.slice(-(len as i64), len) } - /// # Safety - /// - /// No bounds checks are performed on offset and length pub fn slice(&self, offset: i64, length: usize) -> Column { match self { Column::Series(s) => s.slice(offset, length).into(), @@ -399,7 +386,7 @@ impl Column { pub fn null_count(&self) -> usize { match self { Self::Series(s) => s.null_count(), - Self::Scalar(s) if s.value.is_null() => s.length, + Self::Scalar(s) if s.scalar.is_null() 
=> s.length, Self::Scalar(_) => 0, } } @@ -700,6 +687,14 @@ impl Column { self.as_materialized_series().shift(periods).into() } + #[cfg(feature = "zip_with")] + pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { + // @scalar-opt + self.as_materialized_series() + .zip_with(mask, other.as_materialized_series()) + .map(Self::from) + } + #[cfg(feature = "zip_with")] pub fn zip_with_same_type( &self, @@ -798,14 +793,6 @@ impl Column { .map(Self::from) } - #[cfg(feature = "zip_with")] - pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .zip_with(mask, other.as_materialized_series()) - .map(Self::from) - } - pub fn is_finite(&self) -> PolarsResult { // @scalar-opt self.as_materialized_series().is_finite() @@ -862,7 +849,7 @@ impl Column { match self { Column::Series(s) => s.get_unchecked(index), - Column::Scalar(s) => s.value.as_any_value(), + Column::Scalar(s) => s.scalar.as_any_value(), } } @@ -947,8 +934,11 @@ impl ChunkCompare<&Column> for Column { impl Default for Column { fn default() -> Self { - // @scalar-opt - Column::Series(Series::default()) + Self::new_scalar( + PlSmallStr::EMPTY, + Scalar::new(DataType::Int64, AnyValue::Null), + 0, + ) } } @@ -988,16 +978,27 @@ impl From for Column { impl ScalarColumn { #[inline] - pub fn new(name: PlSmallStr, value: Scalar, length: usize) -> Self { + pub fn new(name: PlSmallStr, scalar: Scalar, length: usize) -> Self { Self { name, - value, + scalar, length, materialized: OnceLock::new(), } } + #[inline] + pub fn new_empty(name: PlSmallStr, dtype: DataType) -> Self { + Self { + name, + scalar: Scalar::new(dtype, AnyValue::Null), + length: 0, + + materialized: OnceLock::new(), + } + } + fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { let series = if length == 0 { Series::new_empty(name, value.dtype()) @@ -1012,7 +1013,7 @@ impl ScalarColumn { /// Materialize the [`ScalarColumn`] into a 
[`Series`]. pub fn to_series(&self) -> Series { - Self::_to_series(self.name.clone(), self.value.clone(), self.length) + Self::_to_series(self.name.clone(), self.scalar.clone(), self.length) } /// Get the [`ScalarColumn`] as [`Series`] @@ -1026,7 +1027,7 @@ impl ScalarColumn { pub fn take_materialized_series(self) -> Series { self.materialized .into_inner() - .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) + .unwrap_or_else(|| Self::_to_series(self.name, self.scalar, self.length)) } /// Take the [`ScalarColumn`] as a series with a single value. @@ -1037,7 +1038,7 @@ impl ScalarColumn { Some(s) => s.head(Some(1)), None => Self::_to_series( self.name.clone(), - self.value.clone(), + self.scalar.clone(), usize::min(1, self.length), ), } @@ -1057,10 +1058,16 @@ impl ScalarColumn { /// Resize the [`ScalarColumn`] to new `length`. /// /// This reuses the materialized [`Series`], if `length <= self.length`. + /// + /// # Panics + /// + /// This panics if `self.length == 0`. pub fn resize(&self, length: usize) -> ScalarColumn { + assert_ne!(self.length, 0); + let mut sliced = Self { name: self.name.clone(), - value: self.value.clone(), + scalar: self.scalar.clone(), length, materialized: OnceLock::new(), }; @@ -1075,8 +1082,93 @@ impl ScalarColumn { sliced } + pub fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult { + // @NOTE: We expect that when casting the materialized series mostly does not need change + // the physical array. Therefore, we try to cast the entire materialized array if it is + // available. 
+ + match self.materialized.get() { + Some(s) => { + let materialized = s.cast_with_options(dtype, options)?; + assert_eq!(self.length, materialized.len()); + + let mut casted = if materialized.len() == 0 { + Self::new_empty(materialized.name().clone(), materialized.dtype().clone()) + } else { + // SAFETY: Just did bounds check + let scalar = unsafe { materialized.get_unchecked(0) }.into_static()?; + Self::new( + materialized.name().clone(), + Scalar::new(materialized.dtype().clone(), scalar), + self.length, + ) + }; + casted.materialized = OnceLock::from(materialized); + Ok(casted) + }, + None => { + let s = self + .as_single_value_series() + .cast_with_options(dtype, options)?; + assert_eq!(1, s.len()); + + if self.length == 0 { + Ok(Self::new_empty(s.name().clone(), s.dtype().clone())) + } else { + Self::from_single_value_series(s, self.length) + } + }, + } + } + + pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { + self.cast_with_options(dtype, CastOptions::Strict) + } + pub fn cast(&self, dtype: &DataType) -> PolarsResult { + self.cast_with_options(dtype, CastOptions::NonStrict) + } + /// # Safety + /// + /// This can lead to invalid memory access in downstream code. + pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { + // @NOTE: We expect that when casting the materialized series mostly does not need change + // the physical array. Therefore, we try to cast the entire materialized array if it is + // available. 
+ + match self.materialized.get() { + Some(s) => { + let materialized = s.cast_unchecked(dtype)?; + assert_eq!(self.length, materialized.len()); + + let mut casted = if materialized.len() == 0 { + Self::new_empty(materialized.name().clone(), materialized.dtype().clone()) + } else { + // SAFETY: Just did bounds check + let scalar = unsafe { materialized.get_unchecked(0) }.into_static()?; + Self::new( + materialized.name().clone(), + Scalar::new(materialized.dtype().clone(), scalar), + self.length, + ) + }; + casted.materialized = OnceLock::from(materialized); + Ok(casted) + }, + None => { + let s = self.as_single_value_series().cast_unchecked(dtype)?; + assert_eq!(1, s.len()); + + if self.length == 0 { + Ok(Self::new_empty(s.name().clone(), s.dtype().clone())) + } else { + Self::from_single_value_series(s, self.length) + } + }, + } + } + pub fn has_nulls(&self) -> bool { - self.length != 0 && self.value.is_null() + self.length != 0 && self.scalar.is_null() } } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 567b2c046a34..1af40d9fbbbe 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -174,10 +174,12 @@ pub struct DataFrame { } impl DataFrame { + #[inline] pub fn materialized_column_iter(&self) -> impl ExactSizeIterator { self.columns.iter().map(Column::as_materialized_series) } + #[inline] pub fn par_materialized_column_iter(&self) -> impl ParallelIterator { self.columns.par_iter().map(Column::as_materialized_series) } diff --git a/crates/polars-plan/src/plans/lit.rs b/crates/polars-plan/src/plans/lit.rs index b48896ae26d4..6d95d7c443ca 100644 --- a/crates/polars-plan/src/plans/lit.rs +++ b/crates/polars-plan/src/plans/lit.rs @@ -133,7 +133,7 @@ impl LiteralValue { #[cfg(feature = "dtype-date")] Date(v) => AnyValue::Date(*v), #[cfg(feature = "dtype-datetime")] - DateTime(v, tu, tz) => AnyValue::Datetime(*v, *tu, tz), + DateTime(v, tu, tz) => AnyValue::Datetime(*v, *tu, 
tz.as_ref()), #[cfg(feature = "dtype-time")] Time(v) => AnyValue::Time(*v), Series(_) => return None, @@ -311,7 +311,7 @@ impl TryFrom> for LiteralValue { #[cfg(feature = "dtype-date")] AnyValue::Date(v) => Ok(LiteralValue::Date(v)), #[cfg(feature = "dtype-datetime")] - AnyValue::Datetime(value, tu, tz) => Ok(LiteralValue::DateTime(value, tu, tz.clone())), + AnyValue::Datetime(value, tu, tz) => Ok(LiteralValue::DateTime(value, tu, tz.cloned())), #[cfg(feature = "dtype-duration")] AnyValue::Duration(value, tu) => Ok(LiteralValue::Duration(value, tu)), #[cfg(feature = "dtype-time")] diff --git a/crates/polars-plan/src/plans/python/pyarrow.rs b/crates/polars-plan/src/plans/python/pyarrow.rs index abd018b3a4e6..20b800fa81b1 100644 --- a/crates/polars-plan/src/plans/python/pyarrow.rs +++ b/crates/polars-plan/src/plans/python/pyarrow.rs @@ -49,7 +49,7 @@ pub fn predicate_to_pa( let s = if v { "True" } else { "False" }; write!(list_repr, "{},", s).unwrap(); } else if let AnyValue::Datetime(v, tu, tz) = av { - let dtm = to_py_datetime(v, &tu, tz.as_ref()); + let dtm = to_py_datetime(v, &tu, tz); write!(list_repr, "{dtm},").unwrap(); } else if let AnyValue::Date(v) = av { write!(list_repr, "to_py_date({v}),").unwrap(); @@ -83,7 +83,7 @@ pub fn predicate_to_pa( Some(format!("to_py_date({v})")) }, #[cfg(feature = "dtype-datetime")] - AnyValue::Datetime(v, tu, tz) => Some(to_py_datetime(v, &tu, tz.as_ref())), + AnyValue::Datetime(v, tu, tz) => Some(to_py_datetime(v, &tu, tz)), // Activate once pyarrow supports them // #[cfg(feature = "dtype-time")] // AnyValue::Time(v) => { diff --git a/crates/polars-python/src/conversion/any_value.rs b/crates/polars-python/src/conversion/any_value.rs index 3141d02799fb..034b8169cebf 100644 --- a/crates/polars-python/src/conversion/any_value.rs +++ b/crates/polars-python/src/conversion/any_value.rs @@ -5,7 +5,7 @@ use polars::chunked_array::object::PolarsObjectSafe; #[cfg(feature = "object")] use polars::datatypes::OwnedObject; use 
polars::datatypes::{DataType, Field, PlHashMap, TimeUnit}; -use polars::prelude::{AnyValue, PlSmallStr, Series}; +use polars::prelude::{AnyValue, PlSmallStr, Series, TimeZone}; use polars_core::export::chrono::{NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike}; use polars_core::utils::any_values_to_supertype_and_n_dtypes; use polars_core::utils::arrow::temporal_conversions::date32_to_date; @@ -65,27 +65,23 @@ pub(crate) fn any_value_into_py_object(av: AnyValue, py: Python) -> PyObject { }; s.into_py(py) }, + AnyValue::CategoricalOwned(idx, rev, arr) | AnyValue::EnumOwned(idx, rev, arr) => { + let s = if arr.is_null() { + rev.get(idx) + } else { + unsafe { arr.deref_unchecked().value(idx as usize) } + }; + s.into_py(py) + }, AnyValue::Date(v) => { let date = date32_to_date(v); date.into_py(py) }, AnyValue::Datetime(v, time_unit, time_zone) => { - if let Some(time_zone) = time_zone { - // When https://github.com/pola-rs/polars/issues/16199 is - // implemented, we'll switch to something like: - // - // let tz: chrono_tz::Tz = time_zone.parse().unwrap(); - // let datetime = tz.from_local_datetime(&naive_datetime).earliest().unwrap(); - // datetime.into_py(py) - let convert = utils.getattr(intern!(py, "to_py_datetime")).unwrap(); - let time_unit = time_unit.to_ascii(); - convert - .call1((v, time_unit, time_zone.as_str())) - .unwrap() - .into_py(py) - } else { - timestamp_to_naive_datetime(v, time_unit).into_py(py) - } + datetime_to_py_object(py, utils, v, time_unit, time_zone) + }, + AnyValue::DatetimeOwned(v, time_unit, time_zone) => { + datetime_to_py_object(py, utils, v, time_unit, time_zone.as_ref().map(AsRef::as_ref)) }, AnyValue::Duration(v, time_unit) => { let time_delta = elapsed_offset_to_timedelta(v, time_unit); @@ -127,6 +123,25 @@ pub(crate) fn any_value_into_py_object(av: AnyValue, py: Python) -> PyObject { } } +fn datetime_to_py_object(py: Python, utils: &Bound, v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> PyObject { + if let Some(time_zone) = 
tz { + // When https://github.com/pola-rs/polars/issues/16199 is + // implemented, we'll switch to something like: + // + // let tz: chrono_tz::Tz = time_zone.parse().unwrap(); + // let datetime = tz.from_local_datetime(&naive_datetime).earliest().unwrap(); + // datetime.into_py(py) + let convert = utils.getattr(intern!(py, "to_py_datetime")).unwrap(); + let time_unit = tu.to_ascii(); + convert + .call1((v, time_unit, time_zone.as_str())) + .unwrap() + .into_py(py) + } else { + timestamp_to_naive_datetime(v, tu).into_py(py) + } +} + type TypeObjectPtr = usize; type InitFn = for<'py> fn(&Bound<'py, PyAny>, bool) -> PyResult>; pub(crate) static LUT: crate::gil_once_cell::GILOnceCell> = @@ -204,7 +219,7 @@ pub(crate) fn py_object_to_any_value<'py>( .call1((ob, intern!(py, "us"))) .unwrap(); let v = date.extract::()?; - Ok(AnyValue::Datetime(v, TimeUnit::Microseconds, &None)) + Ok(AnyValue::Datetime(v, TimeUnit::Microseconds, None)) }) } From 16d1d632cbd0efb94ee388289dab6ca1a9c5c1cc Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 16:24:33 +0200 Subject: [PATCH 34/42] further scalar-opt removals --- crates/polars-core/src/datatypes/any_value.rs | 32 +++++++--- crates/polars-core/src/fmt.rs | 4 +- crates/polars-core/src/frame/column/mod.rs | 61 +++++++++++++++++-- .../src/dsl/function_expr/shrink_type.rs | 38 ++++++------ 4 files changed, 101 insertions(+), 34 deletions(-) diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 9b92ac0990eb..05ad0647dbcf 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -945,9 +945,13 @@ impl<'a> AnyValue<'a> { AnyValue::BinaryOwned(data) => AnyValue::Binary(data), AnyValue::StringOwned(data) => AnyValue::String(data.as_str()), #[cfg(feature = "dtype-datetime")] - AnyValue::DatetimeOwned(v, tu, tz) => AnyValue::Datetime(*v, *tu, tz.as_ref().map(AsRef::as_ref)), + AnyValue::DatetimeOwned(v, tu, tz) => 
{ + AnyValue::Datetime(*v, *tu, tz.as_ref().map(AsRef::as_ref)) + }, #[cfg(feature = "dtype-categorical")] - AnyValue::CategoricalOwned(v, rev, arr) => AnyValue::Categorical(*v, rev.as_ref(), *arr), + AnyValue::CategoricalOwned(v, rev, arr) => { + AnyValue::Categorical(*v, rev.as_ref(), *arr) + }, #[cfg(feature = "dtype-categorical")] AnyValue::EnumOwned(v, rev, arr) => AnyValue::Enum(*v, rev.as_ref(), *arr), av => av.clone(), @@ -1078,9 +1082,13 @@ impl AnyValue<'_> { #[cfg(feature = "object")] (l, ObjectOwned(r)) => *l == AnyValue::Object(&*r.0), #[cfg(feature = "dtype-datetime")] - (DatetimeOwned(lv, ltu, ltz), r) => Datetime(*lv, *ltu, ltz.as_ref().map(|v| v.as_ref())) == *r, + (DatetimeOwned(lv, ltu, ltz), r) => { + Datetime(*lv, *ltu, ltz.as_ref().map(|v| v.as_ref())) == *r + }, #[cfg(feature = "dtype-datetime")] - (l, DatetimeOwned(rv, rtu, rtz)) => *l == Datetime(*rv, *rtu, rtz.as_ref().map(|v| v.as_ref())), + (l, DatetimeOwned(rv, rtu, rtz)) => { + *l == Datetime(*rv, *rtu, rtz.as_ref().map(|v| v.as_ref())) + }, #[cfg(feature = "dtype-categorical")] (CategoricalOwned(lv, lrev, larr), r) => Categorical(*lv, lrev.as_ref(), *larr) == *r, #[cfg(feature = "dtype-categorical")] @@ -1220,13 +1228,21 @@ impl PartialOrd for AnyValue<'_> { #[cfg(feature = "object")] (l, ObjectOwned(r)) => l.partial_cmp(&AnyValue::Object(&*r.0)), #[cfg(feature = "dtype-datetime")] - (DatetimeOwned(lv, ltu, ltz), r) => Datetime(*lv, *ltu, ltz.as_ref().map(|v| v.as_ref())).partial_cmp(r), + (DatetimeOwned(lv, ltu, ltz), r) => { + Datetime(*lv, *ltu, ltz.as_ref().map(|v| v.as_ref())).partial_cmp(r) + }, #[cfg(feature = "dtype-datetime")] - (l, DatetimeOwned(rv, rtu, rtz)) => l.partial_cmp(&Datetime(*rv, *rtu, rtz.as_ref().map(|v| v.as_ref()))), + (l, DatetimeOwned(rv, rtu, rtz)) => { + l.partial_cmp(&Datetime(*rv, *rtu, rtz.as_ref().map(|v| v.as_ref()))) + }, #[cfg(feature = "dtype-categorical")] - (CategoricalOwned(lv, lrev, larr), r) => Categorical(*lv, lrev.as_ref(), 
*larr).partial_cmp(r), + (CategoricalOwned(lv, lrev, larr), r) => { + Categorical(*lv, lrev.as_ref(), *larr).partial_cmp(r) + }, #[cfg(feature = "dtype-categorical")] - (l, CategoricalOwned(rv, rrev, rarr)) => l.partial_cmp(&Categorical(*rv, rrev.as_ref(), *rarr)), + (l, CategoricalOwned(rv, rrev, rarr)) => { + l.partial_cmp(&Categorical(*rv, rrev.as_ref(), *rarr)) + }, #[cfg(feature = "dtype-categorical")] (EnumOwned(lv, lrev, larr), r) => Enum(*lv, lrev.as_ref(), *larr).partial_cmp(r), #[cfg(feature = "dtype-categorical")] diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 4704ce1813cc..9e60fb2b9800 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -1042,7 +1042,9 @@ impl Display for AnyValue<'_> { #[cfg(feature = "dtype-datetime")] AnyValue::Datetime(v, tu, tz) => fmt_datetime(f, *v, *tu, *tz), #[cfg(feature = "dtype-datetime")] - AnyValue::DatetimeOwned(v, tu, tz) => fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref())), + AnyValue::DatetimeOwned(v, tu, tz) => { + fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref())) + }, #[cfg(feature = "dtype-duration")] AnyValue::Duration(v, tu) => match tu { TimeUnit::Nanoseconds => fmt_duration_ns(f, *v), diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index e2badd50b825..8bfe6140d663 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -787,10 +787,20 @@ impl Column { } pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .extend_constant(value, n) - .map(Self::from) + self.as_materialized_series().extend_constant(value, n).map(Column::from) + // @scalar-opt: This currently fails because Scalar::partial_cmp cannot deal with Nulls + // + // match self { + // Column::Series(s) => s.extend_constant(value, n).map(Column::from), + // Column::Scalar(s) => { + // if 
s.scalar.as_any_value() == value && s.len() > 0 { + // Ok(s.resize(s.len() + n).into()) + // } else { + // // @scalar-opt + // s.as_materialized_series().extend_constant(value, n).map(Column::from) + // } + // }, + // } } pub fn is_finite(&self) -> PolarsResult { @@ -870,6 +880,33 @@ impl Column { pub(crate) fn str_value(&self, index: usize) -> PolarsResult> { Ok(self.get(index)?.str_value()) } + + pub fn max_reduce(&self) -> PolarsResult { + match self { + Column::Series(s) => s.max_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. + s.as_single_value_series().max_reduce() + }, + } + } + + pub fn min_reduce(&self) -> PolarsResult { + match self { + Column::Series(s) => s.min_reduce(), + Column::Scalar(s) => { + // We don't really want to deal with handling the full semantics here so we just + // cast to a single value series. This is a tiny bit wasteful, but probably fine. 
+ s.as_single_value_series().min_reduce() + }, + } + } + + pub(crate) fn estimated_size(&self) -> usize { + // @scalar-opt + self.as_materialized_series().estimated_size() + } } impl ChunkCompare<&Column> for Column { @@ -999,6 +1036,22 @@ impl ScalarColumn { } } + pub fn name(&self) -> &PlSmallStr { + &self.name + } + + pub fn dtype(&self) -> &DataType { + self.scalar.dtype() + } + + pub fn len(&self) -> usize { + self.length + } + + pub fn is_empty(&self) -> bool { + self.length == 0 + } + fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { let series = if length == 0 { Series::new_empty(name, value.dtype()) diff --git a/crates/polars-plan/src/dsl/function_expr/shrink_type.rs b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs index 99dbb97cc67c..224691e98ef4 100644 --- a/crates/polars-plan/src/dsl/function_expr/shrink_type.rs +++ b/crates/polars-plan/src/dsl/function_expr/shrink_type.rs @@ -1,42 +1,38 @@ use super::*; -pub(super) fn shrink(s: Column) -> PolarsResult { - if !s.dtype().is_numeric() { - return Ok(s); +pub(super) fn shrink(c: Column) -> PolarsResult { + if !c.dtype().is_numeric() { + return Ok(c); } - if s.dtype().is_float() { - return s.cast(&DataType::Float32); + if c.dtype().is_float() { + return c.cast(&DataType::Float32); } - // @scalar-opt - let s = s.as_materialized_series(); - - if s.dtype().is_unsigned_integer() { - let max = s.max_reduce()?.value().extract::().unwrap_or(0_u64); + if c.dtype().is_unsigned_integer() { + let max = c.max_reduce()?.value().extract::().unwrap_or(0_u64); if cfg!(feature = "dtype-u8") && max <= u8::MAX as u64 { - s.cast(&DataType::UInt8) + c.cast(&DataType::UInt8) } else if cfg!(feature = "dtype-u16") && max <= u16::MAX as u64 { - s.cast(&DataType::UInt16) + c.cast(&DataType::UInt16) } else if max <= u32::MAX as u64 { - s.cast(&DataType::UInt32) + c.cast(&DataType::UInt32) } else { - Ok(s.clone()) + Ok(c) } } else { - let min = s.min_reduce()?.value().extract::().unwrap_or(0_i64); - let 
max = s.max_reduce()?.value().extract::().unwrap_or(0_i64); + let min = c.min_reduce()?.value().extract::().unwrap_or(0_i64); + let max = c.max_reduce()?.value().extract::().unwrap_or(0_i64); if cfg!(feature = "dtype-i8") && min >= i8::MIN as i64 && max <= i8::MAX as i64 { - s.cast(&DataType::Int8) + c.cast(&DataType::Int8) } else if cfg!(feature = "dtype-i16") && min >= i16::MIN as i64 && max <= i16::MAX as i64 { - s.cast(&DataType::Int16) + c.cast(&DataType::Int16) } else if min >= i32::MIN as i64 && max <= i32::MAX as i64 { - s.cast(&DataType::Int32) + c.cast(&DataType::Int32) } else { - Ok(s.clone()) + Ok(c) } } - .map(Column::from) } From 18a34cb32be1c8bf3a652b937414a04ee14fb931 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 16:34:45 +0200 Subject: [PATCH 35/42] more scalar-opt removals --- crates/polars-core/src/frame/mod.rs | 20 ++++--- crates/polars-core/src/scalar/mod.rs | 1 + .../src/executors/group_by.rs | 10 +--- .../src/executors/group_by_dynamic.rs | 7 +-- .../src/executors/group_by_partitioned.rs | 2 - .../src/executors/group_by_rolling.rs | 4 +- .../src/dsl/function_expr/bounds.rs | 42 +++++++-------- .../src/dsl/function_expr/round.rs | 54 ++++++++++++++----- .../polars-python/src/conversion/any_value.rs | 18 +++++-- 9 files changed, 91 insertions(+), 67 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 1af40d9fbbbe..55bb2aed8fc8 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -3064,14 +3064,8 @@ impl DataFrame { let mut count = 0; for s in &self.columns { if cols.contains(s.name()) { - let ca = s.as_materialized_series().struct_()?.clone(); - // @scalar-opt - new_cols.extend_from_slice( - &ca.fields_as_series() - .into_iter() - .map(Column::from) - .collect::>(), - ); + let ca = s.struct_()?.clone(); + new_cols.extend(ca.fields_as_series().into_iter().map(Column::from)); count += 1; } else { new_cols.push(s.clone()) @@ -3179,6 
+3173,16 @@ fn ensure_can_extend(left: &Series, right: &Series) -> PolarsResult<()> { Ok(()) } +// utility to test if we can vstack/extend the columns +fn ensure_can_extend_cols(left: &Column, right: &Column) -> PolarsResult<()> { + polars_ensure!( + left.name() == right.name(), + ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}", + left.name(), right.name(), + ); + Ok(()) +} + #[cfg(test)] mod test { use super::*; diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index 2e762eeb7a32..549a25cef581 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -1,4 +1,5 @@ pub mod reduce; +mod from; use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] diff --git a/crates/polars-mem-engine/src/executors/group_by.rs b/crates/polars-mem-engine/src/executors/group_by.rs index 230c6e3a475f..1ae612f64d67 100644 --- a/crates/polars-mem-engine/src/executors/group_by.rs +++ b/crates/polars-mem-engine/src/executors/group_by.rs @@ -88,14 +88,8 @@ pub(super) fn group_by_helper( rayon::join(get_columns, get_agg) }); - let agg_columns = agg_columns?; - // @scalar-opt - let agg_columns = agg_columns - .into_iter() - .map(Column::from) - .collect::>(); - - columns.extend_from_slice(&agg_columns); + + columns.extend(agg_columns?.into_iter().map(Column::from)); DataFrame::new(columns) } diff --git a/crates/polars-mem-engine/src/executors/group_by_dynamic.rs b/crates/polars-mem-engine/src/executors/group_by_dynamic.rs index e38ad7d5022c..b5f98666d281 100644 --- a/crates/polars-mem-engine/src/executors/group_by_dynamic.rs +++ b/crates/polars-mem-engine/src/executors/group_by_dynamic.rs @@ -59,16 +59,11 @@ impl GroupByDynamicExec { } let agg_columns = evaluate_aggs(&df, &self.aggs, groups, state)?; - // @scalar-opt - let agg_columns = agg_columns - .into_iter() - .map(Column::from) - .collect::>(); let mut columns = Vec::with_capacity(agg_columns.len() + 1 + keys.len()); 
columns.extend_from_slice(&keys); columns.push(time_key); - columns.extend_from_slice(&agg_columns); + columns.extend(agg_columns.into_iter().map(Column::from)); DataFrame::new(columns) } diff --git a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs index 658b259f0809..83c6ec2e5bda 100644 --- a/crates/polars-mem-engine/src/executors/group_by_partitioned.rs +++ b/crates/polars-mem-engine/src/executors/group_by_partitioned.rs @@ -303,8 +303,6 @@ impl PartitionGroupByExec { acc }) .unwrap(); - // @scalar-opt - let keys = keys.into_iter().map(Column::from).collect(); // the partitioned group_by has added columns so we must update the schema. state.set_schema(self.output_schema.clone()); diff --git a/crates/polars-mem-engine/src/executors/group_by_rolling.rs b/crates/polars-mem-engine/src/executors/group_by_rolling.rs index 3e84740ea92d..8ad2352572a0 100644 --- a/crates/polars-mem-engine/src/executors/group_by_rolling.rs +++ b/crates/polars-mem-engine/src/executors/group_by_rolling.rs @@ -81,13 +81,11 @@ impl GroupByRollingExec { }; let agg_columns = evaluate_aggs(&df, &self.aggs, groups, state)?; - // @scalar-opt - let agg_columns: Vec = agg_columns.into_iter().map(Column::from).collect(); let mut columns = Vec::with_capacity(agg_columns.len() + 1 + keys.len()); columns.extend_from_slice(&keys); columns.push(time_key); - columns.extend_from_slice(&agg_columns); + columns.extend(agg_columns.into_iter().map(Column::from)); DataFrame::new(columns) } diff --git a/crates/polars-plan/src/dsl/function_expr/bounds.rs b/crates/polars-plan/src/dsl/function_expr/bounds.rs index fb589cfbfb4d..77c8a6f3ef5f 100644 --- a/crates/polars-plan/src/dsl/function_expr/bounds.rs +++ b/crates/polars-plan/src/dsl/function_expr/bounds.rs @@ -1,24 +1,23 @@ use super::*; pub(super) fn upper_bound(s: &Column) -> PolarsResult { - // @scalar-opt let name = s.name().clone(); use DataType::*; let s = match 
s.dtype().to_physical() { #[cfg(feature = "dtype-i8")] - Int8 => Column::new(name, &[i8::MAX]), + Int8 => Column::new_scalar(name, Scalar::from(i8::MAX), 1), #[cfg(feature = "dtype-i16")] - Int16 => Column::new(name, &[i16::MAX]), - Int32 => Column::new(name, &[i32::MAX]), - Int64 => Column::new(name, &[i64::MAX]), + Int16 => Column::new_scalar(name, Scalar::from(i16::MAX), 1), + Int32 => Column::new_scalar(name, Scalar::from(i32::MAX), 1), + Int64 => Column::new_scalar(name, Scalar::from(i64::MAX), 1), #[cfg(feature = "dtype-u8")] - UInt8 => Column::new(name, &[u8::MAX]), + UInt8 => Column::new_scalar(name, Scalar::from(u8::MAX), 1), #[cfg(feature = "dtype-u16")] - UInt16 => Column::new(name, &[u16::MAX]), - UInt32 => Column::new(name, &[u32::MAX]), - UInt64 => Column::new(name, &[u64::MAX]), - Float32 => Column::new(name, &[f32::INFINITY]), - Float64 => Column::new(name, &[f64::INFINITY]), + UInt16 => Column::new_scalar(name, Scalar::from(u16::MAX), 1), + UInt32 => Column::new_scalar(name, Scalar::from(u32::MAX), 1), + UInt64 => Column::new_scalar(name, Scalar::from(u64::MAX), 1), + Float32 => Column::new_scalar(name, Scalar::from(f32::INFINITY), 1), + Float64 => Column::new_scalar(name, Scalar::from(f64::INFINITY), 1), dt => polars_bail!( ComputeError: "cannot determine upper bound for dtype `{}`", dt, ), @@ -27,24 +26,23 @@ pub(super) fn upper_bound(s: &Column) -> PolarsResult { } pub(super) fn lower_bound(s: &Column) -> PolarsResult { - // @scalar-opt let name = s.name().clone(); use DataType::*; let s = match s.dtype().to_physical() { #[cfg(feature = "dtype-i8")] - Int8 => Column::new(name, &[i8::MIN]), + Int8 => Column::new_scalar(name, Scalar::from(i8::MIN), 1), #[cfg(feature = "dtype-i16")] - Int16 => Column::new(name, &[i16::MIN]), - Int32 => Column::new(name, &[i32::MIN]), - Int64 => Column::new(name, &[i64::MIN]), + Int16 => Column::new_scalar(name, Scalar::from(i16::MIN), 1), + Int32 => Column::new_scalar(name, Scalar::from(i32::MIN), 1), + Int64 => 
Column::new_scalar(name, Scalar::from(i64::MIN), 1), #[cfg(feature = "dtype-u8")] - UInt8 => Column::new(name, &[u8::MIN]), + UInt8 => Column::new_scalar(name, Scalar::from(u8::MIN), 1), #[cfg(feature = "dtype-u16")] - UInt16 => Column::new(name, &[u16::MIN]), - UInt32 => Column::new(name, &[u32::MIN]), - UInt64 => Column::new(name, &[u64::MIN]), - Float32 => Column::new(name, &[f32::NEG_INFINITY]), - Float64 => Column::new(name, &[f64::NEG_INFINITY]), + UInt16 => Column::new_scalar(name, Scalar::from(u16::MIN), 1), + UInt32 => Column::new_scalar(name, Scalar::from(u32::MIN), 1), + UInt64 => Column::new_scalar(name, Scalar::from(u64::MIN), 1), + Float32 => Column::new_scalar(name, Scalar::from(f32::NEG_INFINITY), 1), + Float64 => Column::new_scalar(name, Scalar::from(f64::NEG_INFINITY), 1), dt => polars_bail!( ComputeError: "cannot determine lower bound for dtype `{}`", dt, ), diff --git a/crates/polars-plan/src/dsl/function_expr/round.rs b/crates/polars-plan/src/dsl/function_expr/round.rs index 6858abce9b8c..d13e25407b54 100644 --- a/crates/polars-plan/src/dsl/function_expr/round.rs +++ b/crates/polars-plan/src/dsl/function_expr/round.rs @@ -1,23 +1,49 @@ +use polars_core::frame::column::ScalarColumn; + use super::*; -pub(super) fn round(s: &Column, decimals: u32) -> PolarsResult { - // @scalar-opt - s.as_materialized_series().round(decimals).map(Column::from) +pub(super) fn round(c: &Column, decimals: u32) -> PolarsResult { + match c { + Column::Series(s) => s.round(decimals).map(Column::from), + Column::Scalar(s) if s.is_empty() => s.as_materialized_series().round(decimals).map(Column::from), + Column::Scalar(s) => ScalarColumn::from_single_value_series( + s.as_single_value_series().round(decimals)?, + s.len(), + ) + .map(Column::from), + } } -pub(super) fn round_sig_figs(s: &Column, digits: i32) -> PolarsResult { - // @scalar-opt - s.as_materialized_series() - .round_sig_figs(digits) - .map(Column::from) +pub(super) fn round_sig_figs(c: &Column, digits: i32) -> 
PolarsResult { + match c { + Column::Series(s) => s.round_sig_figs(digits).map(Column::from), + Column::Scalar(s) if s.is_empty() => s.as_materialized_series().round_sig_figs(digits).map(Column::from), + Column::Scalar(s) => ScalarColumn::from_single_value_series( + s.as_single_value_series().round_sig_figs(digits)?, + s.len(), + ) + .map(Column::from), + } } -pub(super) fn floor(s: &Column) -> PolarsResult { - // @scalar-opt - s.as_materialized_series().floor().map(Column::from) +pub(super) fn floor(c: &Column) -> PolarsResult { + match c { + Column::Series(s) => s.floor().map(Column::from), + Column::Scalar(s) if s.is_empty() => s.as_materialized_series().floor().map(Column::from), + Column::Scalar(s) => { + ScalarColumn::from_single_value_series(s.as_single_value_series().floor()?, s.len()) + .map(Column::from) + }, + } } -pub(super) fn ceil(s: &Column) -> PolarsResult { - // @scalar-opt - s.as_materialized_series().ceil().map(Column::from) +pub(super) fn ceil(c: &Column) -> PolarsResult { + match c { + Column::Series(s) => s.ceil().map(Column::from), + Column::Scalar(s) if s.is_empty() => s.as_materialized_series().ceil().map(Column::from), + Column::Scalar(s) => { + ScalarColumn::from_single_value_series(s.as_single_value_series().ceil()?, s.len()) + .map(Column::from) + }, + } } diff --git a/crates/polars-python/src/conversion/any_value.rs b/crates/polars-python/src/conversion/any_value.rs index 034b8169cebf..70cfaaf6d3ab 100644 --- a/crates/polars-python/src/conversion/any_value.rs +++ b/crates/polars-python/src/conversion/any_value.rs @@ -80,9 +80,13 @@ pub(crate) fn any_value_into_py_object(av: AnyValue, py: Python) -> PyObject { AnyValue::Datetime(v, time_unit, time_zone) => { datetime_to_py_object(py, utils, v, time_unit, time_zone) }, - AnyValue::DatetimeOwned(v, time_unit, time_zone) => { - datetime_to_py_object(py, utils, v, time_unit, time_zone.as_ref().map(AsRef::as_ref)) - }, + AnyValue::DatetimeOwned(v, time_unit, time_zone) => 
datetime_to_py_object( + py, + utils, + v, + time_unit, + time_zone.as_ref().map(AsRef::as_ref), + ), AnyValue::Duration(v, time_unit) => { let time_delta = elapsed_offset_to_timedelta(v, time_unit); time_delta.into_py(py) @@ -123,7 +127,13 @@ pub(crate) fn any_value_into_py_object(av: AnyValue, py: Python) -> PyObject { } } -fn datetime_to_py_object(py: Python, utils: &Bound, v: i64, tu: TimeUnit, tz: Option<&TimeZone>) -> PyObject { +fn datetime_to_py_object( + py: Python, + utils: &Bound, + v: i64, + tu: TimeUnit, + tz: Option<&TimeZone>, +) -> PyObject { if let Some(time_zone) = tz { // When https://github.com/pola-rs/polars/issues/16199 is // implemented, we'll switch to something like: From 733421dc595055ea2697a9a1b06d47c7f420ec31 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 16:35:08 +0200 Subject: [PATCH 36/42] fmt --- crates/polars-core/src/frame/column/mod.rs | 6 ++++-- crates/polars-core/src/scalar/mod.rs | 2 +- crates/polars-plan/src/dsl/function_expr/round.rs | 9 +++++++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index 8bfe6140d663..4acf57f4b2d6 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -787,9 +787,11 @@ impl Column { } pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { - self.as_materialized_series().extend_constant(value, n).map(Column::from) + self.as_materialized_series() + .extend_constant(value, n) + .map(Column::from) // @scalar-opt: This currently fails because Scalar::partial_cmp cannot deal with Nulls - // + // // match self { // Column::Series(s) => s.extend_constant(value, n).map(Column::from), // Column::Scalar(s) => { diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index 549a25cef581..3e456837e534 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ 
b/crates/polars-core/src/scalar/mod.rs @@ -1,5 +1,5 @@ -pub mod reduce; mod from; +pub mod reduce; use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] diff --git a/crates/polars-plan/src/dsl/function_expr/round.rs b/crates/polars-plan/src/dsl/function_expr/round.rs index d13e25407b54..41b2f04324d0 100644 --- a/crates/polars-plan/src/dsl/function_expr/round.rs +++ b/crates/polars-plan/src/dsl/function_expr/round.rs @@ -5,7 +5,9 @@ use super::*; pub(super) fn round(c: &Column, decimals: u32) -> PolarsResult { match c { Column::Series(s) => s.round(decimals).map(Column::from), - Column::Scalar(s) if s.is_empty() => s.as_materialized_series().round(decimals).map(Column::from), + Column::Scalar(s) if s.is_empty() => { + s.as_materialized_series().round(decimals).map(Column::from) + }, Column::Scalar(s) => ScalarColumn::from_single_value_series( s.as_single_value_series().round(decimals)?, s.len(), @@ -17,7 +19,10 @@ pub(super) fn round(c: &Column, decimals: u32) -> PolarsResult { pub(super) fn round_sig_figs(c: &Column, digits: i32) -> PolarsResult { match c { Column::Series(s) => s.round_sig_figs(digits).map(Column::from), - Column::Scalar(s) if s.is_empty() => s.as_materialized_series().round_sig_figs(digits).map(Column::from), + Column::Scalar(s) if s.is_empty() => s + .as_materialized_series() + .round_sig_figs(digits) + .map(Column::from), Column::Scalar(s) => ScalarColumn::from_single_value_series( s.as_single_value_series().round_sig_figs(digits)?, s.len(), From c9b272387cef460240769e895b52388af76f9adb Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 16:39:38 +0200 Subject: [PATCH 37/42] fix small errors --- crates/polars-core/src/frame/mod.rs | 27 +++------------------------ crates/polars-core/src/scalar/from.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 24 deletions(-) create mode 100644 crates/polars-core/src/scalar/from.rs diff --git a/crates/polars-core/src/frame/mod.rs 
b/crates/polars-core/src/frame/mod.rs index 55bb2aed8fc8..633e3ef48c10 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -197,8 +197,8 @@ impl DataFrame { /// /// FFI buffers are included in this estimation. pub fn estimated_size(&self) -> usize { - self.materialized_column_iter() - .map(|s| s.estimated_size()) + self.columns.iter() + .map(Column::estimated_size) .sum() } @@ -973,10 +973,6 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .try_for_each::<_, PolarsResult<_>>(|(left, right)| { - // @scalar-opt - let left = left.into_materialized_series(); - let right = right.as_materialized_series(); - ensure_can_extend(&*left, right)?; left.append(right)?; Ok(()) @@ -995,10 +991,6 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .for_each(|(left, right)| { - // @scalar-opt - let left = left.into_materialized_series(); - let right = right.as_materialized_series(); - left.append(right).expect("should not fail"); }); } @@ -1028,9 +1020,6 @@ impl DataFrame { .iter_mut() .zip(other.columns.iter()) .try_for_each::<_, PolarsResult<_>>(|(left, right)| { - let left = left.into_materialized_series(); - let right = right.as_materialized_series(); - ensure_can_extend(&*left, right)?; left.extend(right)?; Ok(()) @@ -3164,17 +3153,7 @@ impl From for Vec { } // utility to test if we can vstack/extend the columns -fn ensure_can_extend(left: &Series, right: &Series) -> PolarsResult<()> { - polars_ensure!( - left.name() == right.name(), - ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}", - left.name(), right.name(), - ); - Ok(()) -} - -// utility to test if we can vstack/extend the columns -fn ensure_can_extend_cols(left: &Column, right: &Column) -> PolarsResult<()> { +fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> { polars_ensure!( left.name() == right.name(), ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}", diff --git 
a/crates/polars-core/src/scalar/from.rs b/crates/polars-core/src/scalar/from.rs new file mode 100644 index 000000000000..35345b2a6527 --- /dev/null +++ b/crates/polars-core/src/scalar/from.rs @@ -0,0 +1,27 @@ +use super::{AnyValue, DataType, Scalar}; + +macro_rules! impl_from { + ($(($t:ty, $av:ident, $dt:ident))+) => { + $( + impl From<$t> for Scalar { + #[inline] + fn from(v: $t) -> Self { + Self::new(DataType::$dt, AnyValue::$av(v)) + } + } + )+ + } +} + +impl_from! { + (i8, Int8, Int8) + (i16, Int16, Int16) + (i32, Int32, Int32) + (i64, Int64, Int64) + (u8, UInt8, UInt8) + (u16, UInt16, UInt16) + (u32, UInt32, UInt32) + (u64, UInt64, UInt64) + (f32, Float32, Float32) + (f64, Float64, Float64) +} From 801dfe05708b299daa56cadabcad61b46bc1c65b Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 16:40:49 +0200 Subject: [PATCH 38/42] fmt --- crates/polars-core/src/frame/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 633e3ef48c10..307843afbfa6 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -197,9 +197,7 @@ impl DataFrame { /// /// FFI buffers are included in this estimation. pub fn estimated_size(&self) -> usize { - self.columns.iter() - .map(Column::estimated_size) - .sum() + self.columns.iter().map(Column::estimated_size).sum() } // Reduce monomorphization. 
From 834ec503abacaab8172fc1c8400c118377f9fca5 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 16:43:11 +0200 Subject: [PATCH 39/42] feature gate datetime --- crates/polars-core/src/fmt.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 9e60fb2b9800..c930b9e94da7 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -907,6 +907,7 @@ fn fmt_float(f: &mut Formatter<'_>, width: usize, v: T) -> fmt } } +#[cfg(feature = "dtype-datetime")] fn fmt_datetime( f: &mut Formatter<'_>, v: i64, From 81986620a0a61260eebb5aa98208b8b96b469bc2 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 17:46:05 +0200 Subject: [PATCH 40/42] remove resize constraint --- crates/polars-core/src/frame/column/mod.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index 4acf57f4b2d6..2296b09a4a03 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -1113,14 +1113,8 @@ impl ScalarColumn { /// Resize the [`ScalarColumn`] to new `length`. /// /// This reuses the materialized [`Series`], if `length <= self.length`. - /// - /// # Panics - /// - /// This panics if `self.length == 0`. 
pub fn resize(&self, length: usize) -> ScalarColumn { - assert_ne!(self.length, 0); - - let mut sliced = Self { + let mut resized = Self { name: self.name.clone(), scalar: self.scalar.clone(), length, @@ -1129,12 +1123,12 @@ impl ScalarColumn { if self.length >= length { if let Some(materialized) = self.materialized.get() { - sliced.materialized = OnceLock::from(materialized.head(Some(length))); - debug_assert_eq!(sliced.materialized.get().unwrap().len(), length); + resized.materialized = OnceLock::from(materialized.head(Some(length))); + debug_assert_eq!(resized.materialized.get().unwrap().len(), length); } } - sliced + resized } pub fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult { From 581f94d33ba70014bd0cacbac6590ca71d12be2b Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 17:51:50 +0200 Subject: [PATCH 41/42] fix doc comment --- crates/polars-core/src/frame/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 307843afbfa6..84bd07bb3f28 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -2065,7 +2065,7 @@ impl DataFrame { /// let mut df = DataFrame::new(vec![s0, s1])?; /// /// // Add 32 to get lowercase ascii values - /// df.apply_at_idx(1, |s| s + 32); + /// df.apply_at_idx(1, |s| (s + 32).unwrap()); /// # Ok::<(), PolarsError>(()) /// ``` /// Results in: From 015ca791d3f5007980f0e03fbf6d6b1077742c6d Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 13 Sep 2024 17:56:56 +0200 Subject: [PATCH 42/42] remove widely used materializations --- crates/polars-core/src/frame/mod.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 84bd07bb3f28..a72da39d915d 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -575,7 
+575,8 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn schema(&self) -> Schema { - self.materialized_column_iter() + self.columns + .iter() .map(|x| (x.name().clone(), x.dtype().clone())) .collect() } @@ -1327,13 +1328,7 @@ impl DataFrame { None => return None, } // SAFETY: we just checked bounds - unsafe { - Some( - self.materialized_column_iter() - .map(|s| s.get_unchecked(idx)) - .collect(), - ) - } + unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) } } /// Select a [`Series`] by index. @@ -2014,7 +2009,7 @@ impl DataFrame { /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]); /// let mut df = DataFrame::new(vec![s0, s1])?; /// - /// fn str_to_len(str_val: &Series) -> Series { + /// fn str_to_len(str_val: &Column) -> Column { /// str_val.str() /// .unwrap() /// .into_iter() @@ -2022,7 +2017,7 @@ impl DataFrame { /// opt_name.map(|name: &str| name.len() as u32) /// }) /// .collect::() - /// .into_series() + /// .into_column() /// } /// /// // Replace the names column by the length of the names. @@ -2046,11 +2041,11 @@ impl DataFrame { /// ``` pub fn apply(&mut self, name: &str, f: F) -> PolarsResult<&mut Self> where - F: FnOnce(&Series) -> C, + F: FnOnce(&Column) -> C, C: IntoColumn, { let idx = self.check_name_to_idx(name)?; - self.apply_at_idx(idx, |c| f(c.as_materialized_series())) + self.apply_at_idx(idx, f) } /// Apply a closure to a column at index `idx`. This is the recommended way to do in place